Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index snippets #574

Merged
merged 8 commits into from
Feb 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions matchcode/migrations/0005_stemmedsnippetindex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Generated by Django 5.1.5 on 2025-02-21 01:12

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the table for the ``StemmedSnippetIndex`` model."""

    # Both the previous matchcode migration and the packagedb migration must
    # be applied first, since the new model has foreign keys into packagedb.
    dependencies = [
        ("matchcode", "0004_snippetindex_position_and_more"),
        ("packagedb", "0090_alter_packageactivity_uuid"),
    ]

    operations = [
        migrations.CreateModel(
            name="StemmedSnippetIndex",
            fields=[
                # Auto-created integer primary key.
                (
                    "id",
                    models.AutoField(
                        verbose_name="ID",
                        primary_key=True,
                        auto_created=True,
                        serialize=False,
                    ),
                ),
                # Indexed binary snippet fingerprint.
                (
                    "fingerprint",
                    models.BinaryField(
                        max_length=16,
                        db_index=True,
                        help_text="Binary form of a snippet fingerprint",
                    ),
                ),
                ("position", models.PositiveIntegerField(default=0)),
                # Rows are removed together with the Package they point to.
                (
                    "package",
                    models.ForeignKey(
                        to="packagedb.package",
                        on_delete=models.CASCADE,
                        help_text="The Package that this file is from",
                    ),
                ),
                # Rows are removed together with the Resource they point to.
                (
                    "resource",
                    models.ForeignKey(
                        to="packagedb.resource",
                        on_delete=models.CASCADE,
                        help_text="The Package that this snippet fingerprint is from",
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
        ),
    ]
22 changes: 16 additions & 6 deletions matchcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import attr
from licensedcode.spans import Span
from matchcode_toolkit.fingerprinting import SNIPPET_WINDOW_LENGTH
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode_toolkit.fingerprinting import split_fingerprint
Expand Down Expand Up @@ -403,7 +404,7 @@ class ApproximateResourceContentIndex(ApproximateMatchingHashMixin):

class PackageSnippetMatch(NamedTuple):
package: Package
fingerprints: list["SnippetIndex"]
fingerprints: list
fingerprints_count: int


Expand All @@ -425,7 +426,7 @@ def to_dict(self):
}


class SnippetIndex(PackageRelatedMixin, models.Model):
class BaseSnippetIndexMixin(PackageRelatedMixin, models.Model):
resource = models.ForeignKey(
Resource,
help_text="The Package that this snippet fingerprint is from",
Expand All @@ -447,7 +448,8 @@ class SnippetIndex(PackageRelatedMixin, models.Model):
default=0,
)

# TODO: window length must be constant so we can calculate offsets
class Meta:
abstract = True

@classmethod
def index(cls, fingerprint, position, resource, package):
Expand Down Expand Up @@ -548,8 +550,8 @@ def match_resources(cls, fingerprints, top=None, **kwargs):
extended_file_fragment_matches_by_fingerprints = defaultdict(list)
for fp in fingerprints:
snippet = fp["snippet"]
start_pos = fp["start_pos"]
end_pos = fp["end_pos"]
start_pos = fp["position"]
end_pos = start_pos + SNIPPET_WINDOW_LENGTH - 1
resource = kwargs.get("resource")
qspan = Span(start_pos, end_pos)
extended_file_fragment_matches_by_fingerprints[snippet].append(
Expand All @@ -574,7 +576,7 @@ def match_resources(cls, fingerprints, top=None, **kwargs):
matches = []
for r in resources:
# Get unique snippet fingerprints for this Resource
r_snippets = SnippetIndex.objects.filter(resource=r).distinct("fingerprint")
r_snippets = cls.objects.filter(resource=r).distinct("fingerprint")
matching_snippets = r_snippets.filter(fingerprint__in=only_fings)
r_snippets_count = r_snippets.count()
matching_snippets_count = matching_snippets.count()
Expand Down Expand Up @@ -658,6 +660,14 @@ def sorter(m):
return final_matches[:top]


class SnippetIndex(BaseSnippetIndexMixin, models.Model):
    """
    Concrete snippet fingerprint index.

    Declares nothing of its own: every field and method comes from the
    abstract BaseSnippetIndexMixin.
    """


class StemmedSnippetIndex(BaseSnippetIndexMixin, models.Model):
    """
    Concrete index of stemmed snippet fingerprints.

    Declares nothing of its own: every field and method comes from the
    abstract BaseSnippetIndexMixin, so it is stored in its own table but
    behaves like SnippetIndex.
    """


class ApproximateFileIndex(ApproximateMatchingHashMixin, models.Model):
    """
    Concrete model that declares nothing of its own; its definition is
    inherited from ApproximateMatchingHashMixin.
    """

Expand Down
10 changes: 5 additions & 5 deletions matchcode/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ def setUp(self):
self.test_resource1_snippets = fingerprints["snippets"]
for snippet in self.test_resource1_snippets:
fingerprint = snippet["snippet"]
position = snippet["start_pos"]
position = snippet["position"]
SnippetIndex.index(
fingerprint,
position,
Expand All @@ -513,7 +513,7 @@ def setUp(self):
self.test_resource2_snippets = fingerprints2["snippets"]
for snippet in self.test_resource2_snippets:
fingerprint = snippet["snippet"]
position = snippet["start_pos"]
position = snippet["position"]
SnippetIndex.index(
fingerprint,
position,
Expand Down Expand Up @@ -545,7 +545,7 @@ def setUp(self):
self.test_resource3_snippets = fingerprints3["snippets"]
for snippet in self.test_resource3_snippets:
fingerprint = snippet["snippet"]
position = snippet["start_pos"]
position = snippet["position"]
SnippetIndex.index(
fingerprint,
position,
Expand All @@ -569,7 +569,7 @@ def setUp(self):
self.test_resource4_snippets = fingerprints4["snippets"]
for snippet in self.test_resource4_snippets:
fingerprint = snippet["snippet"]
position = snippet["start_pos"]
position = snippet["position"]
SnippetIndex.index(
fingerprint,
position,
Expand All @@ -593,7 +593,7 @@ def setUp(self):
self.test_resource5_snippets = fingerprints5["snippets"]
for snippet in self.test_resource5_snippets:
fingerprint = snippet["snippet"]
position = snippet["start_pos"]
position = snippet["position"]
SnippetIndex.index(
fingerprint,
position,
Expand Down
7 changes: 7 additions & 0 deletions matchcode_pipeline/pipelines/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ def match_resources_snippets(self):
logger=self.log,
)

def match_resources_stemmed_snippets(self):
    """Match stemmed snippets from selected text files in PurlDB"""
    # Delegate to the pipe, logging through this pipeline's own logger.
    project, log = self.project, self.log
    matching.match_purldb_resources_stemmed_snippets(project, logger=log)

def match_purldb_resources_post_process(self):
"""Choose the best package for PurlDB matched resources."""
matching.match_purldb_resources_post_process(self.project, logger=self.log)
Expand Down
58 changes: 58 additions & 0 deletions matchcode_pipeline/pipes/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from matchcode.models import ApproximateDirectoryContentIndex
from matchcode.models import ApproximateResourceContentIndex
from matchcode.models import SnippetIndex
from matchcode.models import StemmedSnippetIndex
from packagedb.models import Package
from packagedb.models import Resource

Expand Down Expand Up @@ -212,6 +213,30 @@ def match_purldb_resource_snippets(project, resource):
save_resource_fingerprints(resource, {"matched_snippets": matched_snippets})


def match_purldb_resource_stemmed_snippets(project, resource):
    """
    Match the stemmed snippet fingerprints of a single `resource` against the
    StemmedSnippetIndex of the PurlDB.

    For each match returned by ``StemmedSnippetIndex.match_resources``, a
    package is created in `project` from the matched package data and
    associated with `resource`. The match details are then saved on
    `resource` under the ``matched_stemmed_snippets`` key.

    Nothing is created or saved when there are no matches.
    """
    # The stemmed index is populated from the "stemmed_snippets" fingerprints,
    # so the query must use the same fingerprints rather than the regular
    # "snippets" ones.
    fingerprints = resource.extra_data.get("stemmed_snippets", [])
    results = StemmedSnippetIndex.match_resources(
        fingerprints=fingerprints,
        resource=resource,
    )
    if not results:
        return

    matched_stemmed_snippets = []
    for result in results:
        # Use a status distinct from regular snippet matches: this is the
        # status that match_purldb_resources_stemmed_snippets counts when it
        # reports how many resources were matched.
        create_package_from_purldb_data(
            project,
            [resource],
            result.package.to_dict(),
            "stemmed-snippet-matched-to-purldb-resource",
        )
        matched_stemmed_snippets.append(result.to_dict())

    save_resource_fingerprints(
        resource, {"matched_stemmed_snippets": matched_stemmed_snippets}
    )


def match_purldb_directory(project, resource, exact_match=False):
"""Match a single directory resource in the PurlDB."""
fingerprint = resource.extra_data.get("directory_content", "")
Expand Down Expand Up @@ -390,6 +415,39 @@ def match_purldb_resources_snippets(project, logger=None):
)


def match_purldb_resources_stemmed_snippets(project, logger=None):
    """
    Match text CodebaseResources from `project` against the PurlDB using
    stemmed snippet fingerprints.

    Only text resources that do not already carry one of the PurlDB match
    statuses (package, resource, directory, approximate resource or snippet
    match) are considered.

    `logger` is an optional callable taking a message string. When it is
    None, no message is emitted.
    """
    # Get table of resources to match on
    resources = (
        project.codebaseresources.filter(is_text=True)
        .no_status(status=flag.MATCHED_TO_PURLDB_PACKAGE)
        .no_status(status=flag.MATCHED_TO_PURLDB_RESOURCE)
        .no_status(status=flag.MATCHED_TO_PURLDB_DIRECTORY)
        .no_status(status=flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE)
        .no_status(status="snippet-matched-to-purldb-resource")
    )
    resource_count = resources.count()

    if logger:
        logger(
            f"Stemmed snippet matching {resource_count:,d} "
            f"resource{pluralize(resource_count, 's')} against PurlDB"
        )

    resource_iterator = resources.iterator(chunk_size=2000)
    progress = LoopProgress(resource_count, logger)

    for resource in progress.iter(resource_iterator):
        match_purldb_resource_stemmed_snippets(project, resource)

    # `logger` is optional: calling it unconditionally raised a TypeError when
    # it was left to its default of None. The count query is only needed for
    # the log message, so it is skipped as well in that case.
    if logger:
        # NOTE(review): this only counts resources whose status is exactly
        # this value -- confirm the per-resource matcher sets the same status.
        matched_count = project.codebaseresources.filter(
            status="stemmed-snippet-matched-to-purldb-resource"
        ).count()
        logger(
            f"{matched_count:,d} resource{pluralize(matched_count, 's')} "
            f"stemmed snippet matched in PurlDB"
        )


def match_purldb_directories(project, exact_directory_match=False, logger=None):
"""Match directory CodebaseResources from `project` against the PurlDB."""
# If we are able to get match results for a directory fingerprint, then that
Expand Down
16 changes: 15 additions & 1 deletion minecode/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from matchcode.models import ApproximateResourceContentIndex
from matchcode.models import ExactFileIndex
from matchcode.models import SnippetIndex
from matchcode.models import StemmedSnippetIndex
from minecode.management.commands import get_error_message
from minecode.model_utils import update_or_create_resource
from minecode.models import ScannableURI
Expand All @@ -39,7 +40,8 @@ def index_package_files(package, scan_data, reindex=False):
package.approximatedirectorystructureindex_set.all().delete()
package.approximateresourcecontentindex_set.all().delete()
package.exactfileindex_set.all().delete()
package.snippetindex.all().delete()
package.snippetindex_set.all().delete()
package.stemmedsnippetindex_set.all().delete()
package.resources.all().delete()

scan_index_errors = []
Expand Down Expand Up @@ -97,6 +99,18 @@ def index_package_files(package, scan_data, reindex=False):
package=package,
)

stemmed_snippets = resource_extra_data.get("stemmed_snippets", [])
if stemmed_snippets:
for s in stemmed_snippets:
snippet = s["snippet"]
position = s["position"]
_, _ = StemmedSnippetIndex.index(
fingerprint=snippet,
position=position,
resource=r,
package=package,
)

except Exception as e:
msg = get_error_message(e)
scan_index_errors.append(msg)
Expand Down
24 changes: 24 additions & 0 deletions packagedb/migrations/0090_alter_packageactivity_uuid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 5.1.5 on 2025-02-21 01:12

import uuid
from django.db import migrations, models


class Migration(migrations.Migration):
    """Alter the ``uuid`` field of the ``packageactivity`` model."""

    dependencies = [("packagedb", "0089_packageactivity")]

    operations = [
        migrations.AlterField(
            model_name="packageactivity",
            name="uuid",
            # The uuid4 function itself (not a value) is given as the default.
            field=models.UUIDField(
                help_text="The identifier of the package activity",
                unique=True,
                editable=False,
                default=uuid.uuid4,
            ),
        ),
    ]
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
aboutcode-toolkit==11.0.0
aboutcode.federatedcode==0.1.0
aboutcode.hashid==0.2.0
aboutcode.pipeline==0.2.0
aboutcode.pipeline==0.2.1
arrow==1.3.0
asgiref==3.8.1
attrs==25.1.0
Expand Down Expand Up @@ -79,7 +79,7 @@ lxml==5.3.1
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matchcode-toolkit @ git+https://github.com/aboutcode-org/matchcode-toolkit.git@007ca57509ab73dda391af8631616b59e35eb9ef
matchcode-toolkit==7.2.1
mdurl==0.1.2
milksnake==0.1.6
mock==5.1.0
Expand Down Expand Up @@ -136,7 +136,7 @@ rust-inspector==0.1.0
samecode==0.5.1
saneyaml==0.6.1
scancode-toolkit==32.3.2
scancodeio @ git+https://github.com/aboutcode-org/scancode.io.git@ca654094e324f22d7d1b7024b546d2f90591698b
scancodeio @ git+https://github.com/aboutcode-org/scancode.io.git@b7bc7c9efc6d995967b70eb83695271bd1edc3b9
semantic-version==2.10.0
semver==3.0.4
setuptools==75.6.0
Expand Down
6 changes: 3 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ packages = find:
include_package_data = true
zip_safe = false
install_requires =
aboutcode.pipeline >= 0.2.0
aboutcode.pipeline >= 0.2.1
arrow >= 1.3.0
bitarray >= 2.9.2
debian-inspector >= 31.1.0
Expand All @@ -60,10 +60,10 @@ install_requires =
rubymarshal == 1.0.3
scancode-toolkit[packages] >= 32.3.0
urlpy >= 0.5
matchcode-toolkit @ git+https://github.com/aboutcode-org/matchcode-toolkit.git@007ca57509ab73dda391af8631616b59e35eb9ef
matchcode-toolkit >= 7.2.1
purl2vcs >= 2.0.0
univers >= 30.12.1
scancodeio @ git+https://github.com/aboutcode-org/scancode.io.git@ca654094e324f22d7d1b7024b546d2f90591698b
scancodeio @ git+https://github.com/aboutcode-org/scancode.io.git@b7bc7c9efc6d995967b70eb83695271bd1edc3b9
gitpython >= 3.1.43
samecode >= 0.5.1
# FederatedCode integration
Expand Down