Skip to content

Commit

Permalink
Merge pull request #2142 from dandi/unaccent-search
Browse files Browse the repository at this point in the history
Use `Unaccent` with dandiset search filter
  • Loading branch information
jjnesbitt authored Jan 28, 2025
2 parents 9a1a8ae + e4013f1 commit 1290ea6
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 41 deletions.
15 changes: 15 additions & 0 deletions dandiapi/api/migrations/0015_unaccent_extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Generated by Django 4.2.17 on 2025-01-17 17:45
from __future__ import annotations

from django.contrib.postgres.operations import UnaccentExtension
from django.db import migrations


class Migration(migrations.Migration):
    """Run the ``UnaccentExtension`` operation for the ``api`` app."""

    # Applied after migration 0014 of the `api` app.
    dependencies = [
        ('api', '0014_garbagecollectionevent_garbagecollectioneventrecord'),
    ]

    # Single operation: Django's PostgreSQL `unaccent` extension operation.
    operations = [
        UnaccentExtension(),
    ]
32 changes: 32 additions & 0 deletions dandiapi/api/tests/test_dandiset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,38 @@ def test_dandiset_rest_search_identifier(api_client, draft_version):
assert results[0]['draft_version']['name'] == draft_version.name


@pytest.mark.django_db
def test_dandiset_rest_search_accented_characters(api_client, draft_version_factory):
    """Searching with or without diacritics must return the same, non-empty results."""
    dv = draft_version_factory()
    dv.metadata['contributor'][0]['name'] = 'Buzsáki, György'
    dv.save()

    def search(term):
        # Run a dandiset search and return only the list of results.
        return api_client.get('/api/dandisets/', {'search': term}).data['results']

    for accented, unaccented in (('György', 'Gyorgy'), ('Buzsáki', 'Buzsaki')):
        accented_results = search(accented)
        # Guard against a vacuous pass: two empty result lists would also compare equal,
        # so require that the dandiset created above is actually found.
        assert len(accented_results) == 1
        assert accented_results == search(unaccented)


@pytest.mark.django_db
def test_dandiset_rest_search_many_versions(
    api_client, draft_version_factory, published_version_factory, dandiset
):
    """A dandiset with several matching versions appears only once in search results."""
    # Create the draft version first, then the published one, on the same dandiset,
    # giving both the same contributor name so that each version matches the search.
    for make_version in (draft_version_factory, published_version_factory):
        version = make_version(dandiset=dandiset)
        version.metadata['contributor'][0]['name'] = 'testname'
        version.save()

    response = api_client.get('/api/dandisets/', {'search': 'testname'})
    assert len(response.data['results']) == 1


@pytest.mark.django_db
@pytest.mark.parametrize(
'contributors',
Expand Down
114 changes: 73 additions & 41 deletions dandiapi/api/views/dandiset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@

from allauth.socialaccount.models import SocialAccount
from django.contrib.auth.models import User
from django.contrib.postgres.lookups import Unaccent
from django.db import transaction
from django.db.models import Count, Max, OuterRef, QuerySet, Subquery, Sum
from django.db.models.functions import Coalesce
from django.db.models import Count, Max, OuterRef, QuerySet, Subquery, Sum, TextField
from django.db.models.functions import Cast, Coalesce
from django.db.models.query_utils import Q
from django.http import Http404
from drf_yasg.utils import no_body, swagger_auto_schema
Expand All @@ -17,6 +18,7 @@
from rest_framework.generics import get_object_or_404
from rest_framework.response import Response
from rest_framework.serializers import ValidationError
from rest_framework.settings import api_settings as drf_settings
from rest_framework.viewsets import ReadOnlyModelViewSet

from dandiapi.api.asset_paths import get_root_paths_many
Expand Down Expand Up @@ -56,11 +58,12 @@

if TYPE_CHECKING:
from rest_framework.request import Request
from rest_framework.views import APIView

from dandiapi.api.models.upload import Upload


# NOTE(review): this span appears to be a rendered diff hunk. Removed and added
# lines are shown together (both the `DandisetFilterBackend` and the
# `DandisetOrderingFilter` class headers are present, and the body of
# `filter_queryset` appears twice) and indentation has been stripped, so the code
# is left untouched and only comments are added.
#
# Purpose: ordering backend supporting the `id`, `name`, `modified` and `size`
# fields. Only the first requested ordering (`orderings[0]`) is applied. `name`,
# `modified` and `size` are annotated from the most recently created Version of
# each dandiset through a subquery; with no ordering the queryset is returned
# unchanged.
class DandisetFilterBackend(filters.OrderingFilter):
class DandisetOrderingFilter(filters.OrderingFilter):
ordering_fields = ['id', 'name', 'modified', 'size']
ordering_description = (
'Which field to use when ordering the results. '
Expand All @@ -69,51 +72,80 @@ class DandisetFilterBackend(filters.OrderingFilter):

def filter_queryset(self, request, queryset, view):
orderings = self.get_ordering(request, queryset, view)
if orderings:
ordering = orderings[0]
# ordering can be either 'created' or '-created', so test for both
if ordering.endswith('id'):
return queryset.order_by(ordering)
if ordering.endswith('name'):
# name refers to the name of the most recent version, so a subquery is required
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by(
'-created'
)[:1]
queryset = queryset.annotate(name=Subquery(latest_version.values('metadata__name')))
return queryset.order_by(ordering)
if ordering.endswith('modified'):
# modified refers to the modification timestamp of the most
# recent version, so a subquery is required
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by(
'-created'
)[:1]
# get the `modified` field of the most recent version.
# '_version' is appended because the Dandiset model already has a `modified` field
queryset = queryset.annotate(
modified_version=Subquery(latest_version.values('modified'))
)
return queryset.order_by(f'{ordering}_version')
if ordering.endswith('size'):
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by(
'-created'
)[:1]
queryset = queryset.annotate(
size=Subquery(
latest_version.annotate(
size=Coalesce(Sum('assets__blob__size'), 0)
+ Coalesce(Sum('assets__zarr__size'), 0)
).values('size')
)
if not orderings:
return queryset
ordering = orderings[0]

# ordering can be either 'created' or '-created', so test for both
if ordering.endswith('id'):
return queryset.order_by(ordering)

if ordering.endswith('name'):
# name refers to the name of the most recent version, so a subquery is required
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[
:1
]
queryset = queryset.annotate(name=Subquery(latest_version.values('metadata__name')))
return queryset.order_by(ordering)

if ordering.endswith('modified'):
# modified refers to the modification timestamp of the most
# recent version, so a subquery is required
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[
:1
]
# get the `modified` field of the most recent version.
# '_version' is appended because the Dandiset model already has a `modified` field
queryset = queryset.annotate(
modified_version=Subquery(latest_version.values('modified'))
)
return queryset.order_by(f'{ordering}_version')

# size is the sum of the blob sizes plus the sum of the zarr sizes of the assets of
# the most recent version; a missing (NULL) sum is treated as 0 via Coalesce
if ordering.endswith('size'):
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[
:1
]
queryset = queryset.annotate(
size=Subquery(
latest_version.annotate(
size=Coalesce(Sum('assets__blob__size'), 0)
+ Coalesce(Sum('assets__zarr__size'), 0)
).values('size')
)
return queryset.order_by(ordering)
)
return queryset.order_by(ordering)

return queryset


class DandisetSearchFilter(filters.BaseFilterBackend):
    """Keep only dandisets that have a version whose metadata contains the search text.

    The text is matched case-insensitively (``icontains``) against the unaccented
    text form of each version's ``metadata``.
    """

    # Name of the query parameter carrying the search text, taken from the DRF settings.
    search_param = drf_settings.SEARCH_PARAM

    def get_search_term(self, request):
        """Return the search text from the query string ('' if absent), without null chars."""
        raw_term = request.query_params.get(self.search_param, '')
        # strip null characters
        return raw_term.replace('\x00', '')

    def filter_queryset(self, request: Request, queryset: QuerySet, view: APIView) -> QuerySet:
        """Restrict ``queryset`` to matching dandisets; return it as-is for an empty search."""
        term = self.get_search_term(request=request)
        if term:
            # The whole metadata document is cast to text and unaccented before matching.
            unaccented_metadata = Unaccent(Cast('metadata', TextField()))

            # We must formulate the filter using a separate query first, as otherwise
            # the generated SQL is incompatible with previously generated clauses
            versions = Version.objects.alias(search_field=unaccented_metadata)
            versions = versions.filter(search_field__icontains=term)
            dandiset_ids = versions.values_list('dandiset_id', flat=True).distinct()

            return queryset.filter(id__in=dandiset_ids)

        return queryset


class DandisetViewSet(ReadOnlyModelViewSet):
serializer_class = DandisetDetailSerializer
pagination_class = DandiPagination
filter_backends = [filters.SearchFilter, DandisetFilterBackend]
search_fields = ['versions__metadata']
filter_backends = [DandisetSearchFilter, DandisetOrderingFilter]

lookup_value_regex = Dandiset.IDENTIFIER_REGEX
# This is to maintain consistency with the auto-generated names shown in swagger.
Expand Down

0 comments on commit 1290ea6

Please sign in to comment.