
Add new content ID function #1766

Open · wants to merge 20 commits from the add_new_content_id_function branch into main
Conversation

@TG1999 (Contributor) commented on Jan 29, 2025

Reference: #1583

@keshav-space (Member) left a comment

Thanks @TG1999, see some suggestions below.

    # Normalize fields
    normalized_data = {
        "summary": normalize_text(advisory_data.summary),
        "affected_packages": normalize_list(advisory_data.affected_packages),
@keshav-space (Member) commented on Jan 31, 2025

There is no certainty that this will work, since we don't have a proper implementation for handling comparisons between AffectedPackage objects.

For example, this will not be able to normalize the list of affected_packages below:

affected_packages = [
    AffectedPackage(
        package=PackageURL(
            type="alpine",
            namespace=None,
            name="linux-lts",
            version=None,
            qualifiers={
                "arch": "aarch64",
                "distroversion": "v3.20",
                "reponame": "main",
            },
            subpath=None,
        ),
        affected_version_range=None,
        fixed_version="6.6.13-r1",
    ),
    AffectedPackage(
        package=PackageURL(
            type="alpine",
            namespace=None,
            name="linux-lts",
            version=None,
            qualifiers={"arch": "armhf", "distroversion": "v3.21", "reponame": "main"},
            subpath=None,
        ),
        affected_version_range=None,
        fixed_version="6.6.13-r1",
    ),
]
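
One way this could be handled, as a rough sketch rather than this PR's implementation, is to sort the AffectedPackage entries on a canonical key derived from their purl and version fields; the key function below is hypothetical:

def affected_package_sort_key(affected_package):
    # Build a stable, comparable key so a list of AffectedPackage objects can be
    # sorted deterministically before hashing, even when qualifiers differ.
    return (
        str(affected_package.package),  # canonical purl string, including qualifiers
        str(affected_package.affected_version_range or ""),
        str(affected_package.fixed_version or ""),
    )

normalized_affected_packages = sorted(affected_packages, key=affected_package_sort_key)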

Resolved review comment (outdated) on vulnerabilities/utils.py
    }

    if include_metadata:
        normalized_data["created_by"] = advisory_data.created_by

created_by is a model field, not an attribute of AdvisoryData.
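
One possible fix, sketched on the assumption that compute_content_id() may receive either an AdvisoryData object or an Advisory model instance (consistent with the isinstance check that appears later in this review), is to only read created_by from model instances:

from vulnerabilities.models import Advisory

if include_metadata and isinstance(advisory_data, Advisory):
    normalized_data["created_by"] = advisory_data.created_by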

@TG1999 force-pushed the add_new_content_id_function branch from 8793912 to ebf1a32 on February 6, 2025 16:01
@keshav-space (Member) left a comment

Thanks @TG1999, some nits for your consideration.

Resolved review comments (outdated) on:
vulnerabilities/models.py
vulnerabilities/pipelines/remove_duplicate_advisories.py (4 threads)
Comment on lines 32 to 79
"""
Find advisories with the same content and keep only the latest one.
"""
# Get all advisories that have duplicates based on content ID
duplicate_content_ids = (
Advisory.objects.values("unique_content_id")
.annotate(count=Count("id"))
.filter(count__gt=1)
.values_list("unique_content_id", flat=True)
)

self.log(
f"Found {len(duplicate_content_ids)} content IDs with duplicates", level=logging.INFO
)

for content_id in duplicate_content_ids:
# Get all advisories with this content ID
advisories = Advisory.objects.filter(unique_content_id=content_id)

# Find the latest advisory
latest = advisories.latest("date_imported")

# Delete all except the latest
advisories.exclude(id=latest.id).delete()

if self.log:
self.log(
f"Kept advisory {latest.id} and removed "
f"{advisories.count() - 1} duplicates for content ID {content_id}",
level=logging.INFO,
)

def update_content_ids(self):
"""
Update content IDs for all advisories that don't have one.
"""
advisories = Advisory.objects.filter(
Q(unique_content_id="") | Q(unique_content_id__isnull=True)
)

self.log(f"Found {advisories.count()} advisories without content ID", level=logging.INFO)

for advisory in advisories:
advisory.unique_content_id = compute_content_id(advisory)
advisory.save()

if self.log:
self.log(f"Updated content ID for advisory {advisory.id}", level=logging.DEBUG)
@keshav-space (Member) commented on Feb 7, 2025

Maybe we can do something simpler like this.

Suggested change
"""
Find advisories with the same content and keep only the latest one.
"""
# Get all advisories that have duplicates based on content ID
duplicate_content_ids = (
Advisory.objects.values("unique_content_id")
.annotate(count=Count("id"))
.filter(count__gt=1)
.values_list("unique_content_id", flat=True)
)
self.log(
f"Found {len(duplicate_content_ids)} content IDs with duplicates", level=logging.INFO
)
for content_id in duplicate_content_ids:
# Get all advisories with this content ID
advisories = Advisory.objects.filter(unique_content_id=content_id)
# Find the latest advisory
latest = advisories.latest("date_imported")
# Delete all except the latest
advisories.exclude(id=latest.id).delete()
if self.log:
self.log(
f"Kept advisory {latest.id} and removed "
f"{advisories.count() - 1} duplicates for content ID {content_id}",
level=logging.INFO,
)
def update_content_ids(self):
"""
Update content IDs for all advisories that don't have one.
"""
advisories = Advisory.objects.filter(
Q(unique_content_id="") | Q(unique_content_id__isnull=True)
)
self.log(f"Found {advisories.count()} advisories without content ID", level=logging.INFO)
for advisory in advisories:
advisory.unique_content_id = compute_content_id(advisory)
advisory.save()
if self.log:
self.log(f"Updated content ID for advisory {advisory.id}", level=logging.DEBUG)
"""
Recompute content id and remove advisories with the same content and keep only the latest one.
"""
advisories = Advisory.objects.all().order_by("-id").paginated()
advisories_count = Advisory.objects.all().count()
self.log(f"Computing new content id for {advisories_count} and removing duplicates.")
batch_size = 10000
deleted_advisory_count = 0
updated_advisory_count = 0
duplicate_advisory_id = []
updated_advisory = []
content_ids = set()
progress = LoopProgress(
total_iterations=advisories_count,
logger=self.log,
progress_step=1,
)
for advisory in progress.iter(advisories):
content_id = compute_content_id(advisory)
if content_id in content_ids:
duplicate_advisory_id.append(advisory.id)
else:
advisory.unique_content_id = content_id
updated_advisory.append(advisory)
content_ids.add(content_id)
if len(duplicate_advisory_id) > batch_size:
deleted_advisory_count += delete_advisories(
advisory_ids=duplicate_advisory_id,
logger=self.log,
)
if len(updated_advisory) > batch_size:
updated_advisory_count += bulk_update_advisory(
items=updated_advisory,
fields=["unique_content_id"],
logger=self.log,
)
deleted_advisory_count += delete_advisories(
advisory_ids=duplicate_advisory_id,
logger=self.log,
)
updated_advisory_count += bulk_update_advisory(
items=updated_advisory,
fields=["unique_content_id"],
logger=self.log,
)
self.log(f"Removed {deleted_advisory_count} duplicates advisories.")
self.log(f"Updated content id for {deleted_advisory_count} advisories.")
def bulk_update_advisory(items, fields, logger):
item_count = 0
if items:
try:
Advisory.objects.bulk_update(objs=items, fields=fields)
item_count += len(items)
except Exception as e:
logger(f"Error updating Advisory: {e}")
items.clear()
return item_count
def delete_advisories(advisory_ids, logger):
item_count = 0
if advisory_ids:
try:
Advisory.objects.filter(id__in=advisory_ids).delete()
item_count += len(advisory_ids)
except Exception as e:
logger(f"Error deleting Advisory: {e}")
advisory_ids.clear()
return item_count

@pombredanne (Member) left a comment

Here is some more feedback.

Review comments on:
vulnerabilities/importer.py (2 threads, outdated, resolved)
vulnerabilities/models.py (resolved)
vulnerabilities/pipelines/remove_duplicate_advisories.py (3 threads, outdated, resolved)
@keshav-space (Member) left a comment

Thanks @TG1999, some feedback.

Comment on lines +39 to +42
duplicated_advisories = groupby(
    Advisory.objects.order_by("unique_content_id").all().paginated(),
    key=lambda x: x.unique_content_id,
)
Member:

I honestly doubt that this will work smoothly in production; ordering 118 million advisories by unique_content_id, which is an unindexed field, is not a good idea.

#1766 (comment) would be a much more practical approach, since id is an autogenerated primary key field and hence already indexed, and we can use it to select the latest or oldest advisory.

Contributor Author:

db_index is in the models for that! And since we believe there are lots of dupes, I believe the index will work fine.

Member:

We're adding this db_index now. I am not sure how long the index creation itself is going to take.
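
For reference, a minimal sketch of what the indexed field might look like on the Advisory model; the field name comes from this PR, but max_length, blank, and help_text are assumptions:

class Advisory(models.Model):
    ...
    unique_content_id = models.CharField(
        max_length=64,
        blank=True,
        db_index=True,  # building this index on a very large existing table can take a while
        help_text="A unique identifier computed from the normalized advisory content.",
    )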

    advisory.unique_content_id = compute_content_id(advisory)
    advisories.append(advisory)

Advisory.objects.bulk_update(advisories, ["unique_content_id"], batch_size=1000)
Member:

Keeping all these advisories in memory is going to be very expensive!
Assuming 1 Advisory object ≈ 1000 bytes, that would mean for 118 million advisories: (118 × 10^6) × 10^3 bytes ≈ 118 GB of memory!

We should not keep all these advisories in memory. Instead, we should bulk-update as soon as we reach the batch size.
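
A rough sketch of that batched approach, assuming compute_content_id() accepts an Advisory instance; iterator() streams rows instead of caching the whole queryset, and each batch is flushed with bulk_update() as soon as it reaches the batch size:

batch_size = 1000
batch = []

for advisory in Advisory.objects.iterator(chunk_size=batch_size):
    advisory.unique_content_id = compute_content_id(advisory)
    batch.append(advisory)

    if len(batch) >= batch_size:
        # flush the current batch and drop the references before continuing
        Advisory.objects.bulk_update(batch, ["unique_content_id"])
        batch.clear()

# flush whatever is left over
if batch:
    Advisory.objects.bulk_update(batch, ["unique_content_id"])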

    from vulnerabilities.models import Advisory

    if isinstance(advisory_data, Advisory):
        normalized_data = {
Member:

The content ID should also include the aliases field.

Let's look at this data source: https://github.com/nodejs/security-wg/blob/75c78bbbd2ef86d289c16818bb487a70e315dc43/vuln/npm/7.json. This single advisory contains 2 CVEs, and for it we create 2 different advisory records. If we do not consider aliases while computing the content ID, then we will delete one of those advisories in our dedupe pipeline.
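
A minimal sketch of folding aliases into the content ID, reusing the normalized_data dict and normalize_list() helper from the diff above; the JSON-plus-SHA-256 hashing shown here is an assumption, not necessarily this PR's exact approach:

import hashlib
import json

normalized_data["aliases"] = normalize_list(advisory_data.aliases)

# Hash a canonical JSON rendering of the normalized fields so that advisories
# differing only in their aliases get distinct content IDs.
canonical = json.dumps(normalized_data, sort_keys=True, default=str)
content_id = hashlib.sha256(canonical.encode("utf-8")).hexdigest()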

Contributor Author:

Makes sense, we should do it!
