feat: add architecture for record linkage #160

Closed
2 changes: 2 additions & 0 deletions nix/overlay.nix
@@ -40,6 +40,7 @@ in
django-debug-toolbar
django-filter
django-types
django-pandas
django_4
djangorestframework
ipython
@@ -53,6 +54,7 @@ in
daphne
channels
aiofiles
recordlinkage
];

postInstall = ''
40 changes: 40 additions & 0 deletions pkgs/python/django-pandas/default.nix
@@ -0,0 +1,40 @@
{
lib,
buildPythonPackage,
fetchFromGitHub,
setuptools,
django,
pandas,
six,
}:

buildPythonPackage rec {
pname = "django-pandas";
version = "0.6.7";
pyproject = true;

src = fetchFromGitHub {
owner = "chrisdev";
repo = "django-pandas";
rev = version;
hash = "sha256-GJb9qNlaxOz/q2yyQP2UDslT6y8xFjMX+W5EJdK3dEs=";
};

nativeBuildInputs = [ setuptools ];

propagatedBuildInputs = [
django
pandas
six
];

pythonImportsCheck = [ "django_pandas" ];

meta = with lib; {
description = "Tools for working with pandas in your Django projects";
homepage = "https://github.com/chrisdev/django-pandas";
# no changelog provided
license = licenses.bsd3;
maintainers = with maintainers; [ ];
};
}
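
django-pandas is packaged here because the management command further down relies on its read_frame helper to turn Django querysets into pandas DataFrames. A minimal sketch of that helper, assuming the shared.models.Container model is importable just as in the command below:

from django_pandas.io import read_frame
from shared.models import Container

# read_frame() materialises a queryset into a pandas DataFrame,
# one column per selected field.
df = read_frame(Container.objects.values("id", "title"))
print(df.head())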
130 changes: 130 additions & 0 deletions src/website/shared/management/commands/feed_random_candidates.py
@@ -0,0 +1,130 @@
import argparse
import logging
from typing import Any

import pandas as pd
from django.core.management.base import BaseCommand
from django.db.models import F
from django_pandas.io import read_frame
from recordlinkage import Index
from shared.models import Container, LinkageCandidate, NixDerivation

logger = logging.getLogger(__name__)


def get_dataframes(to_pkg_id: int | None) -> Any:
"""
Return dataframes from the appropriate querysets.
"""
container_qs = (
Container.objects.select_related("descriptions", "affected", "cve")
.exclude(title="")
.order_by("id", "-date_public")
.annotate(container_id=F("id"))
.values(
"container_id",
"title",
"descriptions__value",
"affected__vendor",
"affected__product",
"affected__package_name",
"affected__repo",
"affected__cpes__name",
)
)

pkg_qs = (
NixDerivation.objects.select_related("metadata")
.order_by("id")
.annotate(derivation_id=F("id"))
.values(
"derivation_id",
"attribute",
"name",
"system",
"metadata__name",
"metadata__description",
)
)

if to_pkg_id:
pkg_qs = pkg_qs.filter(id=to_pkg_id)

return read_frame(container_qs), read_frame(pkg_qs)


def provide_candidates(df_a: Any, df_b: Any, n: int) -> Any:
indexer = Index().random(n=n)
candidate_links = indexer.index(df_a, df_b)

return candidate_links


class Command(BaseCommand):
"""
Generate and insert random record linkage candidates.

By providing random record linkage candidates we can quickly:
- validate the candidate triage workflow
- bootstrap supervised training for linkage classification models.
"""

help = "Generate and insert random record linkage candidates."

def add_arguments(self, parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"-n",
"--number-inserts",
nargs="?",
type=int,
help="Integer value representing the N entries to be inserted. "
+ " Useful to generate a small dataset for development. "
+ " Defaults to 200.",
default=200,
)
parser.add_argument(
"-p",
"--pkg-id",
nargs="?",
type=int,
help="Integer value representing the id of the package to insert the candidates. "
+ " Useful to generate feed candidates to a specific pkgs for development.",
default=None,
)

def handle(self, *args: str, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedVariable]
logger.info("Generating candidates.")

container_df, pkg_df = get_dataframes(to_pkg_id=kwargs["pkg_id"])
container_ids = container_df.loc[:, "container_id"]
pkg_ids = pkg_df.loc[:, "derivation_id"]

print("\nExample row for container DF:")
print(container_df.iloc[0])

print("\nExample row for pkg DF:")
print(pkg_df.iloc[0])

# Candidates are returned as a pandas MultiIndex pairing index labels of the two dataframes
candidates = provide_candidates(container_df, pkg_df, kwargs["number_inserts"])
print()
print(candidates)

# Extract the ID pairs from their respective sides of the MultiIndex
candidate_container_ids = (
container_ids.loc[candidates.get_level_values(0)]
).reset_index(drop=True)
candidate_pkg_ids = (pkg_ids.loc[candidates.get_level_values(1)]).reset_index(
drop=True
)
id_pairs = pd.concat([candidate_container_ids, candidate_pkg_ids], axis=1)

print("\nCandidates to insert:")
print(id_pairs)

# Insert candidates in bulk
logger.info("Preparing candidates to insert.")
data = id_pairs.to_dict(orient="records")
instances = [LinkageCandidate(**row) for row in data]
LinkageCandidate.objects.bulk_create(instances)
logger.info("%s candidates inserted.", len(instances))
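
A hedged usage sketch for this command, assuming the project's Django settings are loaded; the id 123 is a placeholder, not a real derivation id:

# Equivalent CLI: python manage.py feed_random_candidates -n 50 -p 123
from django.core.management import call_command

# Insert 50 random candidates (the default is 200), restricted to a single
# NixDerivation; omit pkg_id to sample across all derivations.
call_command("feed_random_candidates", number_inserts=50, pkg_id=123)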
82 changes: 82 additions & 0 deletions src/website/shared/migrations/0025_affectedproduct_search_vector_and_more.py
@@ -0,0 +1,82 @@
# Generated by Django 4.2.7 on 2024-06-23 20:18

import django.contrib.postgres.indexes
import django.contrib.postgres.search
from django.contrib.postgres.search import SearchVector
from django.db import migrations
import pgtrigger.compiler
import pgtrigger.migrations

from shared.models import AffectedProduct, Container, Cpe, Description


def update_search_vectors(apps, schema_editor):
Container.objects.update(search_vector=SearchVector("title"))
Description.objects.update(search_vector=SearchVector("value"))
AffectedProduct.objects.update(search_vector=SearchVector("vendor", "product", "package_name", "repo"))
Cpe.objects.update(search_vector=SearchVector("name"))

class Migration(migrations.Migration):

dependencies = [
('shared', '0024_alter_nixlicense_unique_together_and_more'),
]

operations = [
migrations.AddField(
model_name='affectedproduct',
name='search_vector',
field=django.contrib.postgres.search.SearchVectorField(null=True),
),
migrations.AddField(
model_name='container',
name='search_vector',
field=django.contrib.postgres.search.SearchVectorField(null=True),
),
migrations.AddField(
model_name='cpe',
name='search_vector',
field=django.contrib.postgres.search.SearchVectorField(null=True),
),
migrations.AddField(
model_name='description',
name='search_vector',
field=django.contrib.postgres.search.SearchVectorField(null=True),
),
migrations.AddIndex(
model_name='affectedproduct',
index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='shared_affe_search__aa6eef_gin'),
),
migrations.AddIndex(
model_name='container',
index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='shared_cont_search__979c85_gin'),
),
migrations.AddIndex(
model_name='cpe',
index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='shared_cpe_search__347dd9_gin'),
),
migrations.AddIndex(
model_name='description',
index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='shared_desc_search__dd1c6d_gin'),
),
pgtrigger.migrations.AddTrigger(
model_name='affectedproduct',
trigger=pgtrigger.compiler.Trigger(name='affected_search_vector', sql=pgtrigger.compiler.UpsertTriggerSql(execute='tsvector_update_trigger("search_vector", "pg_catalog.english", "vendor", "product", "package_name", "repo")', func='', hash='27c6a9539dba25b1c7641933f15dc17346a04b87', operation='INSERT OR UPDATE OF "vendor", "product", "package_name", "repo"', pgid='pgtrigger_affected_search_vector_25927', table='shared_affectedproduct', when='BEFORE')),
),
pgtrigger.migrations.AddTrigger(
model_name='container',
trigger=pgtrigger.compiler.Trigger(name='cve_container_search_vector', sql=pgtrigger.compiler.UpsertTriggerSql(execute='tsvector_update_trigger("search_vector", "pg_catalog.english", "title")', func='', hash='26d3e8e55e86b059c13aa7375180c06c7c3cfc5e', operation='INSERT OR UPDATE OF "title"', pgid='pgtrigger_cve_container_search_vector_85378', table='shared_container', when='BEFORE')),
),
pgtrigger.migrations.AddTrigger(
model_name='cpe',
trigger=pgtrigger.compiler.Trigger(name='cpe_search_vector_idx', sql=pgtrigger.compiler.UpsertTriggerSql(execute='tsvector_update_trigger("search_vector", "pg_catalog.english", "name")', func='', hash='5831e70cff7886047233ef2d3b870a320fa5d81d', operation='INSERT OR UPDATE OF "name"', pgid='pgtrigger_cpe_search_vector_idx_80861', table='shared_cpe', when='BEFORE')),
),
pgtrigger.migrations.AddTrigger(
model_name='description',
trigger=pgtrigger.compiler.Trigger(name='description_search_vector_idx', sql=pgtrigger.compiler.UpsertTriggerSql(execute='tsvector_update_trigger("search_vector", "pg_catalog.english", "value")', func='', hash='5b6196ba33f28ffdb9dcfff123f0ea2fe94588fd', operation='INSERT OR UPDATE OF "value"', pgid='pgtrigger_description_search_vector_idx_ce47a', table='shared_description', when='BEFORE')),
),
# Make sure that the VectorFields are populated when there's data before this migration runs
migrations.RunPython(
update_search_vectors, reverse_code=migrations.RunPython.noop
)
]
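
The SearchVectorField columns and GIN indexes added above enable Postgres full-text lookups on the records being linked. A minimal sketch of the kind of query they support, with "openssl" as an arbitrary placeholder term:

from django.contrib.postgres.search import SearchQuery
from shared.models import Container

# The GIN-indexed search_vector column keeps this full-text filter cheap;
# the trigger above keeps it in sync with the title field.
matches = Container.objects.filter(search_vector=SearchQuery("openssl"))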
23 changes: 23 additions & 0 deletions src/website/shared/migrations/0026_linkagecandidate.py
@@ -0,0 +1,23 @@
# Generated by Django 4.2.7 on 2024-06-25 22:50

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

dependencies = [
('shared', '0025_affectedproduct_search_vector_and_more'),
]

operations = [
migrations.CreateModel(
name='LinkageCandidate',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('state', models.CharField(choices=[('UNTRIAGED', 'UNTRIAGED'), ('ACCEPTED', 'ACCEPTED'), ('REJECTED', 'REJECTED')], default='UNTRIAGED', max_length=9)),
('container', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='linkage_candidate', to='shared.container')),
('derivation', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='linkage_candidate', to='shared.nixderivation')),
],
),
]
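
The shared/models/record_linkage.py module imported by __init__.py below is not part of this excerpt; reconstructed from the migration above, the model would look roughly like the following sketch (field names, choices, and related_name are taken from the migration; string model references are used to avoid guessing the module layout):

from django.db import models

STATE_CHOICES = [
    ("UNTRIAGED", "UNTRIAGED"),
    ("ACCEPTED", "ACCEPTED"),
    ("REJECTED", "REJECTED"),
]


class LinkageCandidate(models.Model):
    """A proposed link between a CVE container and a Nix derivation."""

    state = models.CharField(max_length=9, choices=STATE_CHOICES, default="UNTRIAGED")
    container = models.ForeignKey(
        "shared.Container", on_delete=models.CASCADE, related_name="linkage_candidate"
    )
    derivation = models.ForeignKey(
        "shared.NixDerivation", on_delete=models.CASCADE, related_name="linkage_candidate"
    )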
1 change: 1 addition & 0 deletions src/website/shared/models/__init__.py
@@ -1,2 +1,3 @@
from .cve import *
from .nix_evaluation import *
from .record_linkage import *