Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new content ID function #1766

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 43 additions & 17 deletions vulnerabilities/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import dataclasses
import datetime
import functools
import logging
import os
import shutil
Expand Down Expand Up @@ -46,7 +47,8 @@
logger = logging.getLogger(__name__)


@dataclasses.dataclass(order=True)
@dataclasses.dataclass(eq=True)
@functools.total_ordering
class VulnerabilitySeverity:
# FIXME: this should be named scoring_system, like in the model
system: ScoringSystem
Expand All @@ -55,15 +57,23 @@ class VulnerabilitySeverity:
published_at: Optional[datetime.datetime] = None

def to_dict(self):
published_at_dict = (
{"published_at": self.published_at.isoformat()} if self.published_at else {}
)
return {
data = {
"system": self.system.identifier,
"value": self.value,
"scoring_elements": self.scoring_elements,
**published_at_dict,
}
if self.published_at:
data["published_at"] = self.published_at.isoformat()
return data

def __lt__(self, other):
    """Order severities by their comparison key; defer to other types."""
    if isinstance(other, VulnerabilitySeverity):
        return self._cmp_key() < other._cmp_key()
    return NotImplemented

# TODO: Add cache
def _cmp_key(self):
return (self.system.identifier, self.value, self.scoring_elements, self.published_at)

@classmethod
def from_dict(cls, severity: dict):
Expand All @@ -79,7 +89,8 @@ def from_dict(cls, severity: dict):
)


@dataclasses.dataclass(order=True)
@dataclasses.dataclass(eq=True)
@functools.total_ordering
class Reference:
reference_id: str = ""
reference_type: str = ""
Expand All @@ -90,21 +101,22 @@ def __post_init__(self):
if not self.url:
raise TypeError("Reference must have a url")

def normalized(self):
severities = sorted(self.severities)
return Reference(
reference_id=self.reference_id,
url=self.url,
severities=severities,
reference_type=self.reference_type,
)
def __lt__(self, other):
    """Order references by their comparison key; defer to other types."""
    if isinstance(other, Reference):
        return self._cmp_key() < other._cmp_key()
    return NotImplemented

# TODO: Add cache
def _cmp_key(self):
return (self.reference_id, self.reference_type, self.url, tuple(self.severities))

def to_dict(self):
"""Return a normalized dictionary representation"""
return {
"reference_id": self.reference_id,
"reference_type": self.reference_type,
"url": self.url,
"severities": [severity.to_dict() for severity in self.severities],
"severities": [severity.to_dict() for severity in sorted(self.severities)],
}

@classmethod
Expand Down Expand Up @@ -140,7 +152,8 @@ class NoAffectedPackages(Exception):
"""


@dataclasses.dataclass(order=True, frozen=True)
@functools.total_ordering
@dataclasses.dataclass(eq=True)
class AffectedPackage:
"""
Relate a Package URL with a range of affected versions and a fixed version.
Expand Down Expand Up @@ -170,6 +183,19 @@ def get_fixed_purl(self):
raise ValueError(f"Affected Package {self.package!r} does not have a fixed version")
return update_purl_version(purl=self.package, version=str(self.fixed_version))

def __lt__(self, other):
    """Order affected packages by their comparison key; defer to other types."""
    if isinstance(other, AffectedPackage):
        return self._cmp_key() < other._cmp_key()
    return NotImplemented

# TODO: Add cache
def _cmp_key(self):
return (
str(self.package),
str(self.affected_version_range or ""),
str(self.fixed_version or ""),
)

@classmethod
def merge(
cls, affected_packages: Iterable
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 4.2.16 on 2025-02-12 13:41

from django.db import migrations, models


class Migration(migrations.Migration):
    # Widens Advisory.unique_content_id from 32 to 64 characters: the content
    # checksum moved from an MD5 hex digest (32 chars) to SHA-256 (64 chars).
    # Also allows blank values and adds a db index for content-ID lookups.

    dependencies = [
        ("vulnerabilities", "0088_fix_alpine_purl_type"),
    ]

    operations = [
        migrations.AlterField(
            model_name="advisory",
            name="unique_content_id",
            field=models.CharField(
                blank=True,
                db_index=True,
                help_text="A 64 character unique identifier for the content of the advisory since we use sha256 as hex",
                max_length=64,
            ),
        ),
    ]
17 changes: 6 additions & 11 deletions vulnerabilities/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
from vulnerabilities import utils
from vulnerabilities.severity_systems import EPSS
from vulnerabilities.severity_systems import SCORING_SYSTEMS
from vulnerabilities.utils import compute_content_id
from vulnerabilities.utils import normalize_purl
from vulnerabilities.utils import purl_to_dict
from vulnerablecode import __version__ as VULNERABLECODE_VERSION
Expand Down Expand Up @@ -1315,8 +1316,10 @@ class Advisory(models.Model):
"""

unique_content_id = models.CharField(
max_length=32,
max_length=64,
TG1999 marked this conversation as resolved.
Show resolved Hide resolved
db_index=True,
blank=True,
help_text="A 64 character unique identifier for the content of the advisory since we use sha256 as hex",
)
aliases = models.JSONField(blank=True, default=list, help_text="A list of alias strings")
summary = models.TextField(
Expand Down Expand Up @@ -1357,16 +1360,8 @@ class Meta:
ordering = ["aliases", "date_published", "unique_content_id"]

def save(self, *args, **kwargs):
checksum = hashlib.md5()
for field in (
self.summary,
self.affected_packages,
self.references,
self.weaknesses,
):
value = json.dumps(field, separators=(",", ":")).encode("utf-8")
checksum.update(value)
self.unique_content_id = checksum.hexdigest()
advisory_data = self.to_advisory_data()
self.unique_content_id = compute_content_id(advisory_data, include_metadata=False)
super().save(*args, **kwargs)

def to_advisory_data(self) -> "AdvisoryData":
Expand Down
83 changes: 83 additions & 0 deletions vulnerabilities/pipelines/remove_duplicate_advisories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
from itertools import groupby

from aboutcode.pipeline import LoopProgress
from django.db.models import Count
from django.db.models import Q

from vulnerabilities.models import Advisory
from vulnerabilities.pipelines import VulnerableCodePipeline
from vulnerabilities.utils import compute_content_id


class RemoveDuplicateAdvisoriesPipeline(VulnerableCodePipeline):
"""Pipeline to remove duplicate advisories based on their content."""

pipeline_id = "remove_duplicate_advisories"

@classmethod
def steps(cls):
    """Return the pipeline steps, executed in order: recompute, then dedupe."""
    return (cls.recompute_content_ids, cls.remove_duplicates)

def remove_duplicates(self):
    """
    Find advisories sharing the same content ID and keep only the oldest
    one (by date_imported), deleting the rest.

    Advisories are streamed ordered by ``unique_content_id`` so that
    ``groupby`` (which only groups consecutive items) sees each content ID
    as one contiguous run.
    """
    duplicated_advisories = groupby(
        Advisory.objects.order_by("unique_content_id").all().paginated(),
        key=lambda advisory: advisory.unique_content_id,
    )

    progress = LoopProgress(total_iterations=Advisory.objects.count(), logger=self.log)
    for content_id, advisories in progress.iter(duplicated_advisories):
        advisories = list(advisories)
        if len(advisories) < 2:
            # Nothing to deduplicate for this content ID; also avoids
            # issuing a no-op DELETE and a log line per unique advisory.
            continue

        oldest = min(advisories, key=lambda advisory: advisory.date_imported)
        duplicate_ids = [advisory.id for advisory in advisories if advisory.id != oldest.id]
        try:
            Advisory.objects.filter(id__in=duplicate_ids).delete()
        except Exception as e:
            self.log(f"Error deleting advisories: {e}", level=logging.ERROR)
            # Do not report a successful removal when the delete failed.
            continue

        self.log(
            f"Kept advisory {oldest.id} and removed "
            f"{len(duplicate_ids)} duplicates for content ID {content_id}",
            level=logging.INFO,
        )

def recompute_content_ids(self):
    """
    Recompute and persist the content ID for every advisory.

    Updates are flushed in fixed-size batches so the whole table is never
    held in memory at once (loading every Advisory before a single
    bulk_update would need memory proportional to the full table).
    """
    batch_size = 1000
    batch = []

    progress = LoopProgress(
        total_iterations=Advisory.objects.count(),
        progress_step=1,
        logger=self.log,
    )

    for advisory in progress.iter(Advisory.objects.all().paginated()):
        advisory.unique_content_id = compute_content_id(advisory)
        batch.append(advisory)
        if len(batch) >= batch_size:
            Advisory.objects.bulk_update(batch, ["unique_content_id"], batch_size=batch_size)
            batch = []

    # Flush the final partial batch, if any.
    if batch:
        Advisory.objects.bulk_update(batch, ["unique_content_id"], batch_size=batch_size)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keeping all that advisories in memory is going to be very expensive!
Assuming 1 Advisory object ≈ 1000 bytes.
That would mean for 118 million advisories: (118 * 10^6) * 10^3 bytes ≈ 118 GB of memory!

We should not keep all these advisories in memory. Instead we should bulk update as soon as we reach the batch size.

3 changes: 3 additions & 0 deletions vulnerabilities/severity_systems.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def compute(self, scoring_elements: str) -> str:
def get(self, scoring_elements: str):
return NotImplementedError

def __str__(self):
return f"{self.identifier}"


@dataclasses.dataclass(order=True)
class Cvssv2ScoringSystem(ScoringSystem):
Expand Down
2 changes: 2 additions & 0 deletions vulnerabilities/tests/test_add_cvsssv31.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def setUp(self):
}
],
"url": "https://nvd.nist.gov/vuln/detail/CVE-2024-1234",
"reference_id": "CVE-2024-1234",
"reference_type": "cve",
}
],
date_collected="2024-09-27T19:38:00Z",
Expand Down
Loading
Loading