From df838819d7680e416a7f7cf874842d90a88fca05 Mon Sep 17 00:00:00 2001 From: Chineta Adinnu Date: Thu, 16 Jan 2025 12:07:27 +0100 Subject: [PATCH 1/4] add searchvector to commit table and index using GIN --- treeherder/config/settings.py | 1 + .../0036_commit_search_vector_idx.py | 24 +++++++++++++++++++ treeherder/model/models.py | 12 ++++++---- treeherder/webapp/api/push.py | 13 +++++++--- 4 files changed, 43 insertions(+), 7 deletions(-) create mode 100644 treeherder/model/migrations/0036_commit_search_vector_idx.py diff --git a/treeherder/config/settings.py b/treeherder/config/settings.py index e59321d31a4..14b4bb1523c 100644 --- a/treeherder/config/settings.py +++ b/treeherder/config/settings.py @@ -67,6 +67,7 @@ INSTALLED_APPS = [ "django.contrib.auth", "django.contrib.contenttypes", + "django.contrib.postgres.search", # Disable Django's own staticfiles handling in favour of WhiteNoise, for # greater consistency between gunicorn and `./manage.py runserver`. "whitenoise.runserver_nostatic", diff --git a/treeherder/model/migrations/0036_commit_search_vector_idx.py b/treeherder/model/migrations/0036_commit_search_vector_idx.py new file mode 100644 index 00000000000..cd711687e3a --- /dev/null +++ b/treeherder/model/migrations/0036_commit_search_vector_idx.py @@ -0,0 +1,24 @@ +# Generated by Django 5.1.2 on 2025-01-24 07:42 + +import django.contrib.postgres.indexes +import django.contrib.postgres.search +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("model", "0035_bugscache_optional_bugzilla_ref"), + ] + + operations = [ + migrations.AddIndex( + model_name="commit", + index=django.contrib.postgres.indexes.GinIndex( + django.contrib.postgres.search.SearchVector( + "revision", "author", "comments", config="english" + ), + name="search_vector_idx", + ), + ), + ] diff --git a/treeherder/model/models.py b/treeherder/model/models.py index eaeb236163f..c6fd69a9ac3 100644 --- a/treeherder/model/models.py +++ b/treeherder/model/models.py @@ -7,12 +7,13 @@ import newrelic.agent from django.contrib.auth.models import User -from django.contrib.postgres.search import TrigramSimilarity +from django.contrib.postgres.indexes import GinIndex from django.core.cache import cache from django.core.exceptions import ObjectDoesNotExist from django.core.validators import MinLengthValidator from django.db import models, transaction from django.db.models import Count, Max, Min, Q, Subquery +from django.contrib.postgres.search import TrigramSimilarity, SearchVector from django.db.utils import ProgrammingError from django.forms import model_to_dict from django.utils import timezone @@ -188,11 +189,14 @@ class Commit(models.Model): class Meta: db_table = "commit" unique_together = ("push", "revision") - + indexes = [ + GinIndex( + SearchVector("revision", "author", "comments", config="english"), + name="search_vector_idx", + ), + ] def __str__(self): return f"{self.push.repository.name} {self.revision}" - - class MachinePlatform(models.Model): id = models.AutoField(primary_key=True) os_name = models.CharField(max_length=25) diff --git a/treeherder/webapp/api/push.py b/treeherder/webapp/api/push.py index 2a2eca11ff0..3d8d326e301 100644 --- a/treeherder/webapp/api/push.py +++ b/treeherder/webapp/api/push.py @@ -9,7 +9,7 @@ from rest_framework.status import HTTP_400_BAD_REQUEST, HTTP_404_NOT_FOUND from treeherder.log_parser.failureline import get_group_results -from treeherder.model.models import Job, JobType, Push, Repository +from treeherder.model.models import Job, JobType, Push, Repository, Commit from treeherder.push_health.builds import get_build_failures from treeherder.push_health.compare import get_commit_history from treeherder.push_health.linting import get_lint_failures @@ -22,6 +22,7 @@ from treeherder.webapp.api.serializers import PushSerializer from treeherder.webapp.api.utils import to_datetime, to_timestamp +from django.contrib.postgres.search import SearchVector, SearchQuery logger = logging.getLogger(__name__) @@ -42,7 +43,6 @@ def list(self, request, project): # This will contain some meta data about the request and results meta = {} - # support ranges for date as well as revisions(changes) like old tbpl for param in [ "fromchange", @@ -60,7 +60,6 @@ def list(self, request, project): all_repos = request.query_params.get("all_repos") pushes = Push.objects.order_by("-time") - if not all_repos: try: repository = Repository.objects.get(name=project) @@ -71,6 +70,14 @@ def list(self, request, project): pushes = pushes.filter(repository=repository) + search_param = filter_params.get("search") + if search_param: + filtered_commits = Commit.objects.annotate( + search=SearchVector("revision", "author", "comments", config="english") + ).filter( + search=SearchQuery(search_param, config="english") + ).values_list("push_id", flat=True) + pushes = pushes.filter(id__in=filtered_commits) for param, value in meta.items(): if param == "fromchange": revision_field = "revision__startswith" if len(value) < 40 else "revision" From f6af36a7022b587273b073dc60a74217fb321ff1 Mon Sep 17 00:00:00 2001 From: Chineta Adinnu Date: Fri, 24 Jan 2025 14:09:03 +0100 Subject: [PATCH 2/4] add ordering by push time and set result limit to 200 --- treeherder/model/models.py | 5 ++++- treeherder/webapp/api/push.py | 24 +++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/treeherder/model/models.py b/treeherder/model/models.py index c6fd69a9ac3..5c56713fa06 100644 --- a/treeherder/model/models.py +++ b/treeherder/model/models.py @@ -8,12 +8,12 @@ import newrelic.agent from django.contrib.auth.models import User from django.contrib.postgres.indexes import GinIndex +from django.contrib.postgres.search import SearchVector, TrigramSimilarity from django.core.cache import cache from django.core.exceptions import ObjectDoesNotExist from django.core.validators import MinLengthValidator from django.db import models, transaction from django.db.models import Count, Max, Min, Q, Subquery -from django.contrib.postgres.search import TrigramSimilarity, SearchVector from django.db.utils import ProgrammingError from django.forms import model_to_dict from django.utils import timezone @@ -195,8 +195,11 @@ class Meta: name="search_vector_idx", ), ] + def __str__(self): return f"{self.push.repository.name} {self.revision}" + + class MachinePlatform(models.Model): id = models.AutoField(primary_key=True) os_name = models.CharField(max_length=25) diff --git a/treeherder/webapp/api/push.py b/treeherder/webapp/api/push.py index 3d8d326e301..a4db54bebc3 100644 --- a/treeherder/webapp/api/push.py +++ b/treeherder/webapp/api/push.py @@ -3,13 +3,14 @@ import newrelic.agent from cache_memoize import cache_memoize +from django.contrib.postgres.search import SearchQuery, SearchVector from rest_framework import viewsets from rest_framework.decorators import action from rest_framework.response import Response from rest_framework.status import HTTP_400_BAD_REQUEST, HTTP_404_NOT_FOUND from treeherder.log_parser.failureline import get_group_results -from treeherder.model.models import Job, JobType, Push, Repository, Commit +from treeherder.model.models import Commit, Job, JobType, Push, Repository from treeherder.push_health.builds import get_build_failures from treeherder.push_health.compare import get_commit_history from treeherder.push_health.linting import get_lint_failures @@ -22,7 +23,6 @@ from treeherder.webapp.api.serializers import PushSerializer from treeherder.webapp.api.utils import to_datetime, to_timestamp -from django.contrib.postgres.search import SearchVector, SearchQuery logger = logging.getLogger(__name__) @@ -71,13 +71,19 @@ def list(self, request, project): pushes = pushes.filter(repository=repository) search_param = filter_params.get("search") - if search_param: - filtered_commits = Commit.objects.annotate( - search=SearchVector("revision", "author", "comments", config="english") - ).filter( - search=SearchQuery(search_param, config="english") - ).values_list("push_id", flat=True) - pushes = pushes.filter(id__in=filtered_commits) + if search_param: + filtered_commits = ( + Commit.objects.annotate( + search=SearchVector("revision", "author", "comments", config="english") + ) + .filter( + search=SearchQuery(search_param, config="english") + # Get most recent results and limit result to 200 + ) + .order_by("-push__time")[:200] + .values_list("push_id", flat=True) + ) + pushes = pushes.filter(id__in=filtered_commits) for param, value in meta.items(): if param == "fromchange": revision_field = "revision__startswith" if len(value) < 40 else "revision" From cdb2b614e1ac7514aad1d323c4ba1aee149f08aa Mon Sep 17 00:00:00 2001 From: Chineta Adinnu Date: Tue, 28 Jan 2025 16:16:53 +0100 Subject: [PATCH 3/4] add test for new search functionality --- tests/webapp/api/test_push_api.py | 89 ++++++++++++++++++++++++++++++- treeherder/webapp/api/push.py | 10 ++-- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/tests/webapp/api/test_push_api.py b/tests/webapp/api/test_push_api.py index 5b82f0b8b76..1391b49e523 100644 --- a/tests/webapp/api/test_push_api.py +++ b/tests/webapp/api/test_push_api.py @@ -5,7 +5,7 @@ from tests.conftest import IS_WINDOWS from treeherder.etl.push import store_push_data -from treeherder.model.models import FailureClassification, JobNote, Push +from treeherder.model.models import Commit, FailureClassification, JobNote, Push from treeherder.webapp.api import utils @@ -365,6 +365,93 @@ def test_push_author_contains(client, test_repository): assert results[0]["id"] == 3 +def test_push_search(client, test_repository): + """ + Test the search parameter for filtering by Commit fields: revision, author, comments. + """ + now = datetime.datetime.now() + push1 = Push.objects.create( + repository=test_repository, + revision="1234abcd", + author="foo@bar.com", + time=now, + ) + push2 = Push.objects.create( + repository=test_repository, + revision="2234abcd", + author="foo2@bar.com", + time=now + datetime.timedelta(seconds=1), + ) + push3 = Push.objects.create( + repository=test_repository, + revision="3234abcd", + author="qux@bar.com", + time=now + datetime.timedelta(seconds=2), + ) + + # Add Commit objects linked to the Push objects + Commit.objects.create( + push=push1, revision="1234abcd", author="kaz ", comments="Initial commit" + ) + Commit.objects.create( + push=push2, revision="2234abcd", author="foo ", comments="Bug 12345567 - fix" + ) + Commit.objects.create( + push=push3, + revision="3234abcd", + author="quxzan .com", + comments="Bug 12345567 - Feature added", + ) + + # Test search by comments + resp = client.get( + reverse("push-list", kwargs={"project": test_repository.name}) + "?search=bug" + ) + assert resp.status_code == 200 + + results = resp.json()["results"] + assert len(results) == 2 + assert set([result["id"] for result in results]) == set([3, 2]) + + # Test search by bug number + resp = client.get( + reverse("push-list", kwargs={"project": test_repository.name}) + "?search=12345567" + ) + assert resp.status_code == 200 + + results = resp.json()["results"] + assert len(results) == 2 + assert set([result["id"] for result in results]) == set([3, 2]) + + # Test search by author + resp = client.get( + reverse("push-list", kwargs={"project": test_repository.name}) + "?search=foo" + ) + assert resp.status_code == 200 + + results = resp.json()["results"] + assert len(results) == 1 + assert results[0]["id"] == push2.id + + # Test search by revision + resp = client.get( + reverse("push-list", kwargs={"project": test_repository.name}) + "?search=3234abcd" + ) + assert resp.status_code == 200 + + results = resp.json()["results"] + assert len(results) == 1 + assert results[0]["id"] == push3.id + + # Test empty search input + resp = client.get(reverse("push-list", kwargs={"project": test_repository.name}) + "?search=") + assert resp.status_code == 200 + + results = resp.json()["results"] + assert len(results) == 3 + assert set([result["id"] for result in results]) == set([3, 2, 1]) + + def test_push_reviewbot(client, test_repository): """ test the reviewbot parameter diff --git a/treeherder/webapp/api/push.py b/treeherder/webapp/api/push.py index a4db54bebc3..8b9a38c4c07 100644 --- a/treeherder/webapp/api/push.py +++ b/treeherder/webapp/api/push.py @@ -1,5 +1,6 @@ import datetime import logging +from datetime import date, timedelta import newrelic.agent from cache_memoize import cache_memoize @@ -77,11 +78,14 @@ def list(self, request, project): search=SearchVector("revision", "author", "comments", config="english") ) .filter( - search=SearchQuery(search_param, config="english") - # Get most recent results and limit result to 200 + search=SearchQuery(search_param, config="english"), + # Filter and retrieve pushes from last 3 months + push__time__gte=date.today() - timedelta(days=4 * 7 * 3), ) - .order_by("-push__time")[:200] .values_list("push_id", flat=True) + # Get most recent results and limit result to 200 + .order_by("-push__time") + .distinct()[:200] ) pushes = pushes.filter(id__in=filtered_commits) for param, value in meta.items(): From 1bacaa45999a16dceb6da0c301dd120e0132ff3e Mon Sep 17 00:00:00 2001 From: Chineta Adinnu Date: Wed, 29 Jan 2025 17:54:44 +0100 Subject: [PATCH 4/4] handle duplicate revisions from different projects --- treeherder/webapp/api/push.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/treeherder/webapp/api/push.py b/treeherder/webapp/api/push.py index 8b9a38c4c07..708a4283457 100644 --- a/treeherder/webapp/api/push.py +++ b/treeherder/webapp/api/push.py @@ -1,6 +1,5 @@ import datetime import logging -from datetime import date, timedelta import newrelic.agent from cache_memoize import cache_memoize @@ -73,14 +72,14 @@ def list(self, request, project): search_param = filter_params.get("search") if search_param: + repository = Repository.objects.get(name=project) filtered_commits = ( Commit.objects.annotate( search=SearchVector("revision", "author", "comments", config="english") ) .filter( search=SearchQuery(search_param, config="english"), - # Filter and retrieve pushes from last 3 months - push__time__gte=date.today() - timedelta(days=4 * 7 * 3), + push__repository=repository, ) .values_list("push_id", flat=True) # Get most recent results and limit result to 200