Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat: add architecture for record linkage #160

Closed
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Generated by Django 4.2.7 on 2024-06-23 20:18

import django.contrib.postgres.indexes
import django.contrib.postgres.search
from django.contrib.postgres.search import SearchVector
from django.db import migrations
import pgtrigger.compiler
import pgtrigger.migrations

from shared.models import AffectedProduct, Container, Cpe, Description


def update_search_vectors(apps, schema_editor):
    """Backfill ``search_vector`` for rows that predate this migration.

    The triggers installed by this migration fire only on INSERT or on UPDATE
    of the document columns, so rows that already exist would keep a NULL
    vector until they are next modified. This function fills them in once.

    Args:
        apps: Historical app registry passed in by ``RunPython``.
        schema_editor: Schema editor of the database being migrated; its
            connection alias selects which database receives the updates.
    """
    # Look models up through the historical registry rather than using the
    # live model classes: the live classes reflect the *current* code, which
    # may no longer match the schema at this point of the migration history.
    alias = schema_editor.connection.alias

    # Model name -> columns that make up its search document. These mirror
    # the column lists of the ``tsvector_update_trigger`` calls in this
    # migration's AddTrigger operations.
    document_fields = {
        "Container": ("title",),
        "Description": ("value",),
        "AffectedProduct": ("vendor", "product", "package_name", "repo"),
        "Cpe": ("name",),
    }

    for model_name, fields in document_fields.items():
        model = apps.get_model("shared", model_name)
        # Use the same text-search configuration as the triggers so that
        # backfilled vectors and trigger-maintained vectors are built alike,
        # whatever the database's default_text_search_config is.
        model.objects.using(alias).update(
            search_vector=SearchVector(*fields, config="pg_catalog.english")
        )

class Migration(migrations.Migration):
    """Add stored full-text search vectors to four ``shared`` models.

    For ``affectedproduct``, ``container``, ``cpe`` and ``description`` this
    migration, in order:

    1. adds a nullable ``search_vector`` column,
    2. adds a GIN index on that column,
    3. installs a BEFORE INSERT OR UPDATE trigger that recomputes the vector
       from the model's document columns,
    4. backfills the vector for rows that already exist.
    """

    dependencies = [
        ('shared', '0024_alter_nixlicense_unique_together_and_more'),
    ]

    operations = [
        # 1. Nullable vector columns (null=True so existing rows stay valid
        #    until the backfill at the end has run).
        migrations.AddField(
            model_name='affectedproduct',
            name='search_vector',
            field=django.contrib.postgres.search.SearchVectorField(null=True),
        ),
        migrations.AddField(
            model_name='container',
            name='search_vector',
            field=django.contrib.postgres.search.SearchVectorField(null=True),
        ),
        migrations.AddField(
            model_name='cpe',
            name='search_vector',
            field=django.contrib.postgres.search.SearchVectorField(null=True),
        ),
        migrations.AddField(
            model_name='description',
            name='search_vector',
            field=django.contrib.postgres.search.SearchVectorField(null=True),
        ),
        # 2. GIN indexes on the vector columns.
        migrations.AddIndex(
            model_name='affectedproduct',
            index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='shared_affe_search__aa6eef_gin'),
        ),
        migrations.AddIndex(
            model_name='container',
            index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='shared_cont_search__979c85_gin'),
        ),
        migrations.AddIndex(
            model_name='cpe',
            index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='shared_cpe_search__347dd9_gin'),
        ),
        migrations.AddIndex(
            model_name='description',
            index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='shared_desc_search__dd1c6d_gin'),
        ),
        # 3. Triggers that rebuild the vector whenever a document column is
        #    inserted or updated. The names, hashes and pgids are generated
        #    values and must stay exactly as written.
        pgtrigger.migrations.AddTrigger(
            model_name='affectedproduct',
            trigger=pgtrigger.compiler.Trigger(name='affected_search_vector', sql=pgtrigger.compiler.UpsertTriggerSql(execute='tsvector_update_trigger("search_vector", "pg_catalog.english", "vendor", "product", "package_name", "repo")', func='', hash='27c6a9539dba25b1c7641933f15dc17346a04b87', operation='INSERT OR UPDATE OF "vendor", "product", "package_name", "repo"', pgid='pgtrigger_affected_search_vector_25927', table='shared_affectedproduct', when='BEFORE')),
        ),
        pgtrigger.migrations.AddTrigger(
            model_name='container',
            trigger=pgtrigger.compiler.Trigger(name='cve_container_search_vector', sql=pgtrigger.compiler.UpsertTriggerSql(execute='tsvector_update_trigger("search_vector", "pg_catalog.english", "title")', func='', hash='26d3e8e55e86b059c13aa7375180c06c7c3cfc5e', operation='INSERT OR UPDATE OF "title"', pgid='pgtrigger_cve_container_search_vector_85378', table='shared_container', when='BEFORE')),
        ),
        pgtrigger.migrations.AddTrigger(
            model_name='cpe',
            trigger=pgtrigger.compiler.Trigger(name='cpe_search_vector_idx', sql=pgtrigger.compiler.UpsertTriggerSql(execute='tsvector_update_trigger("search_vector", "pg_catalog.english", "name")', func='', hash='5831e70cff7886047233ef2d3b870a320fa5d81d', operation='INSERT OR UPDATE OF "name"', pgid='pgtrigger_cpe_search_vector_idx_80861', table='shared_cpe', when='BEFORE')),
        ),
        pgtrigger.migrations.AddTrigger(
            model_name='description',
            trigger=pgtrigger.compiler.Trigger(name='description_search_vector_idx', sql=pgtrigger.compiler.UpsertTriggerSql(execute='tsvector_update_trigger("search_vector", "pg_catalog.english", "value")', func='', hash='5b6196ba33f28ffdb9dcfff123f0ea2fe94588fd', operation='INSERT OR UPDATE OF "value"', pgid='pgtrigger_description_search_vector_idx_ce47a', table='shared_description', when='BEFORE')),
        ),
        # 4. Make sure that the vector fields are populated when there is
        #    already data before this migration runs. Reversing is a no-op:
        #    the columns themselves are dropped when the AddField operations
        #    are reversed.
        migrations.RunPython(
            update_search_vectors, reverse_code=migrations.RunPython.noop
        ),
    ]
77 changes: 77 additions & 0 deletions src/website/shared/models/cve.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from typing import Any

from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVectorField
from django.core.validators import RegexValidator
from django.db import models
from django.db.models.signals import post_save
from django.dispatch import receiver
from django.utils.translation import gettext_lazy as _
from pgtrigger import UpdateSearchVector

from .nix_evaluation import NixDerivation

Expand Down Expand Up @@ -86,9 +89,27 @@ class Description(models.Model):
value = models.TextField()
media = models.ManyToManyField(SupportingMedia)

search_vector = SearchVectorField(null=True)

def __str__(self) -> str:
    """Return the first 32 characters of ``value`` followed by ``...``."""
    snippet = self.value[:32]
    return f"{snippet}..."

class Meta:
    # Database trigger that keeps ``search_vector`` in step with ``value``
    # whenever a row changes.
    triggers = [
        UpdateSearchVector(
            name="description_search_vector_idx",
            vector_field="search_vector",
            document_fields=["value"],
        )
    ]
    # GIN index to speed up queries against the stored search vector.
    indexes = [GinIndex(fields=["search_vector"])]


class Tag(models.Model):
"""Class representing a tag related to a CVE record."""
Expand Down Expand Up @@ -198,6 +219,22 @@ class Cpe(models.Model):
],
)

search_vector = SearchVectorField(null=True)

class Meta:
    # Database trigger that keeps ``search_vector`` in step with ``name``
    # whenever a row changes.
    triggers = [
        UpdateSearchVector(
            name="cpe_search_vector_idx",
            vector_field="search_vector",
            document_fields=["name"],
        )
    ]
    # GIN index to speed up queries against the stored search vector.
    indexes = [GinIndex(fields=["search_vector"])]


class Module(models.Model):
name = models.CharField(max_length=4096)
Expand Down Expand Up @@ -232,6 +269,27 @@ class Status(models.TextChoices):
program_files = models.ManyToManyField(ProgramFile)
program_routines = models.ManyToManyField(ProgramRoutine)

search_vector = SearchVectorField(null=True)

class Meta:
    # Database trigger that rebuilds ``search_vector`` from the vendor,
    # product, package name and repo columns whenever a row changes.
    triggers = [
        UpdateSearchVector(
            name="affected_search_vector",
            vector_field="search_vector",
            document_fields=["vendor", "product", "package_name", "repo"],
        )
    ]
    # GIN index to speed up queries against the stored search vector.
    indexes = [GinIndex(fields=["search_vector"])]


class Container(models.Model):
"""Class representing a container (i.e. structured data) related to a CVE record."""
Expand Down Expand Up @@ -267,9 +325,28 @@ class Type(models.TextChoices):
credits = models.ManyToManyField(Credit)
source = models.JSONField(default=dict)

# Enable full-text search on CVE searches
search_vector = SearchVectorField(null=True)

def __str__(self) -> str:
    """Return the ``cve_id`` of the object referenced by ``self.cve``."""
    linked_cve = self.cve
    return linked_cve.cve_id

class Meta:
    # Database trigger that keeps ``search_vector`` in step with ``title``
    # whenever a row changes.
    triggers = [
        UpdateSearchVector(
            name="cve_container_search_vector",
            vector_field="search_vector",
            document_fields=["title"],
        )
    ]
    # GIN index to speed up queries against the stored search vector.
    indexes = [GinIndex(fields=["search_vector"])]


###
#
Expand Down
34 changes: 17 additions & 17 deletions src/website/webview/views.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import re
from typing import Any

from django.contrib.postgres.search import SearchVector
from django.db.models.manager import BaseManager
from django.shortcuts import get_object_or_404
from django.views.generic import DetailView, ListView, TemplateView
from shared.models import Container, CveRecord, NixpkgsIssue
from django.contrib.postgres.search import SearchVector
from shared.models import (
Container,
CveRecord,
NixpkgsIssue,
)


class HomeView(TemplateView):
Expand All @@ -17,7 +21,7 @@ class TriageView(ListView):
model = Container
paginate_by = 25

def get_queryset(self):
def get_queryset(self) -> BaseManager[Container]:
qs = (
Container.objects.prefetch_related("descriptions", "affected", "cve")
.exclude(title="")
Expand All @@ -27,21 +31,17 @@ def get_queryset(self):
if not search_query:
return qs.all()
else:
return (
qs.annotate(
search=SearchVector(
"title",
"descriptions__value",
"affected__vendor",
"affected__product",
"affected__package_name",
"affected__repo",
"affected__cpes__name",
)
return qs.annotate(
search=SearchVector(
"title",
"descriptions__value",
"affected__vendor",
"affected__product",
"affected__package_name",
"affected__repo",
"affected__cpes__name",
)
.filter(search=search_query)
.distinct("id")
)
).distinct("id")


class NixpkgsIssueView(DetailView):
Expand Down
Loading