Skip to content

Commit

Permalink
move spam check into save method
Browse files Browse the repository at this point in the history
  • Loading branch information
John Tordoff committed Feb 9, 2024
1 parent 82ee230 commit e577e5d
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 54 deletions.
2 changes: 1 addition & 1 deletion osf/models/spam.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def do_check_spam(self, author, author_email, content, request_headers):
return

request_kwargs = {
'remote_addr': request_headers.get('Remote-Addr') or request_headers['Host'], # for local testing
'remote_addr': request_headers.get('Remote-Addr') or request_headers.get('Host'), # for local testing
'user_agent': request_headers.get('User-Agent'),
'referer': request_headers.get('Referer'),
}
Expand Down
80 changes: 52 additions & 28 deletions osf/models/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
from website.project import new_bookmark_collection
from website.util.metrics import OsfSourceTags
from importlib import import_module
from osf.utils.requests import get_headers_from_request

SessionStore = import_module(settings.SESSION_ENGINE).SessionStore

Expand Down Expand Up @@ -1025,8 +1026,14 @@ def save(self, *args, **kwargs):

self.update_is_active()
self.username = self.username.lower().strip() if self.username else None
dirty_fields = set(self.get_dirty_fields(check_relationship=True))
ret = super(OSFUser, self).save(*args, **kwargs)
dirty_fields = self.get_dirty_fields(check_relationship=True)
ret = super(OSFUser, self).save(*args, **kwargs) # must save BEFORE spam check, as user needs guid.
if set(self.SPAM_USER_PROFILE_FIELDS.keys()).intersection(dirty_fields):
request = get_current_request()
headers = get_headers_from_request(request)
self.check_spam(dirty_fields, request_headers=headers)

dirty_fields = set(dirty_fields)
if self.SEARCH_UPDATE_FIELDS.intersection(dirty_fields) and self.is_confirmed:
self.update_search()
self.update_search_nodes_contributors()
Expand Down Expand Up @@ -1859,28 +1866,38 @@ def get_node_comment_timestamps(self, target_id):
return self.comments_viewed_timestamp.get(target_id, default_timestamp)

def _get_spam_content(self, saved_fields=None, **unused_kwargs):
spam_check_fields = set(self.SPAM_USER_PROFILE_FIELDS.keys())
spam_check_source = {}
if saved_fields:
spam_check_source = saved_fields
spam_check_fields = spam_check_fields.intersection(set(saved_fields.keys()))
else:
spam_check_source = {field: getattr(self, field) for field in spam_check_fields}
"""
Retrieves content for spam checking from specified fields.
Sometimes from validated serializer data, sometimes from
dirty_fields.
spam_check_contents = []
for spam_field in spam_check_fields:
spam_field_content = spam_check_source[spam_field]
if not spam_field_content:
continue
if spam_field in ['schools', 'jobs']:
spam_check_contents.extend(
_get_nested_spam_check_content(spam_check_source, spam_field)
)
else: # Only other currently checked field is social['profileWebsites']
spam_check_contents.extend(
spam_check_source.get('social', dict()).get('profileWebsites', list())
)
return ' '.join(spam_check_contents).strip()
Parameters:
- saved_fields (dict): Fields that have been saved and their values.
- unused_kwargs: Ignored additional keyword arguments.
Returns:
- str: A string containing the spam check contents, joined by spaces.
"""
# Determine which fields to check for spam, preferring saved_fields if provided.
spam_check_fields = set(self.SPAM_USER_PROFILE_FIELDS)
spam_check_source = saved_fields if saved_fields else {field: getattr(self, field) for field in
spam_check_fields}

# Ensure we only consider relevant fields present in the spam_check_source.
spam_check_fields = spam_check_fields.intersection(spam_check_source.keys())

spam_contents = []
for field in spam_check_fields:
field_value = spam_check_source.get(field)
# Validated fields have field_value values, but saved fields are only field names.
if field_value:
spam_contents.extend(_get_nested_spam_check_content(spam_check_source, field))
else:
value = getattr(self, field, {})
spam_contents.extend(_get_nested_spam_check_content(value, field))

# Join all collected spam check contents into a single string.
return ' '.join(spam_contents).strip()

def check_spam(self, saved_fields, request_headers):
is_spam = False
Expand Down Expand Up @@ -2101,11 +2118,18 @@ def create_bookmark_collection(sender, instance, created, **kwargs):


def _get_nested_spam_check_content(spam_check_source, field_name):
spam_check_data = spam_check_source[field_name]
"""
Social fields are formatted differently when coming from the serializer or save
"""
spam_check_content = []
for entry in spam_check_data:
if spam_check_source:
if field_name == 'social':
from_save = spam_check_source.get('profileWebsites', [])
from_serializer = spam_check_source.get('social', {}).get('profileWebsites', [])
return from_serializer or from_save

for spam_check_subfield in OSFUser.SPAM_USER_PROFILE_FIELDS[field_name]:
subfield_content = entry.get(spam_check_subfield)
if subfield_content:
spam_check_content.append(subfield_content)
if spam_check_subfield in spam_check_source:
spam_check_content.append(spam_check_source.get(spam_check_subfield))

return spam_check_content
2 changes: 1 addition & 1 deletion osf/utils/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def get_headers_from_request(req):
k: v
for k, v in headers.items()
}
headers['Remote-Addr'] = req.remote_addr
headers['Remote-Addr'] = getattr(req, 'remote_addr', None)
return headers


Expand Down
21 changes: 16 additions & 5 deletions osf_tests/management_commands/test_backfill_domain_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def test_wiki(self, spam_domain):
def test_user(self, spam_domain):
user = UserFactory()
user.social['profileWebsites'] = [spam_domain.geturl()]
user.save()
with mock.patch.object(backfill_task.spam_tasks.requests, 'head'):
user.save()
return user

@pytest.mark.enable_enqueue_task
Expand Down Expand Up @@ -223,6 +224,8 @@ def test_backfill_comment_domain_references__resources_without_domains_ignored(s

@pytest.mark.enable_enqueue_task
def test_backfill_user_domain_references(self, test_user, spam_domain):
# delete DomainReference's created on test_user save()
DomainReference.objects.all().delete()
assert DomainReference.objects.count() == 0
with mock.patch.object(backfill_task.spam_tasks.requests, 'head'):
backfill_task.backfill_domain_references(model_name='osf.OSFUser')
Expand All @@ -236,17 +239,25 @@ def test_backfill_user_domain_references(self, test_user, spam_domain):

@pytest.mark.enable_enqueue_task
def test_backfill_user_domain_references__only_selected_once(self, test_user):
# Delete domains created on user save to simulate a backfill
NotableDomain.objects.all().delete()

with mock.patch.object(backfill_task.spam_tasks.requests, 'head'):
initial_resource_count = backfill_task.backfill_domain_references(model_name='osf.OSFUser')
subsequent_resource_count = backfill_task.backfill_domain_references(model_name='osf.OSFUser')
assert initial_resource_count == 1
assert subsequent_resource_count == 0

@pytest.mark.enable_enqueue_task
def test_backfill_user_domain_references__resources_without_domains_ignored(self, test_user):
# User without links
UserFactory()
def test_backfill_user_domain_references__resources_without_domains_ignored(self, test_user, spam_domain):
# Delete domains created on user save to simulate a backfill
NotableDomain.objects.all().delete()

# User without links, to be ignored
with mock.patch.object(backfill_task.spam_tasks.requests, 'head'):
UserFactory().save()

with mock.patch.object(backfill_task.spam_tasks.requests, 'head'):
resource_count = backfill_task.backfill_domain_references(model_name='osf.OSFUser')
# Just the test_user retrieved
# Just the test_user counted
assert resource_count == 1
5 changes: 4 additions & 1 deletion osf_tests/test_search_views.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import mock
import pytest
from nose.tools import * # noqa: F403

from osf_tests import factories
from tests.base import OsfTestCase
from website.util import api_url_for
from website.views import find_bookmark_collection
from osf.external.spam import tasks as spam_tasks


@pytest.mark.enable_search
Expand Down Expand Up @@ -174,7 +176,8 @@ def test_search_views(self):
'impactStory': user_two.given_name,
'baiduScholar': user_two.given_name
}
user_two.save()
with mock.patch.object(spam_tasks.requests, 'head'):
user_two.save()

user_three = factories.AuthUserFactory(fullname='Janet Umwali')
user_three.social = {
Expand Down
29 changes: 23 additions & 6 deletions osf_tests/test_user.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@

SessionStore = import_module(django_conf_settings.SESSION_ENGINE).SessionStore

from osf.external.spam import tasks as spam_tasks


pytestmark = pytest.mark.django_db

Expand Down Expand Up @@ -2191,11 +2193,13 @@ def test_validate_social_profile_website_many_different(self):
with open(url_data_path) as url_test_data:
data = json.load(url_test_data)

previous_number_of_domains = NotableDomain.objects.all().count()
fails_at_end = False
for should_pass in data['testsPositive']:
try:
self.user.social = {'profileWebsites': [should_pass]}
self.user.save()
with mock.patch.object(spam_tasks.requests, 'head'):
self.user.save()
assert self.user.social['profileWebsites'] == [should_pass]
except ValidationError:
fails_at_end = True
Expand All @@ -2205,16 +2209,26 @@ def test_validate_social_profile_website_many_different(self):
self.user.social = {'profileWebsites': [should_fail]}
try:
with pytest.raises(ValidationError):
self.user.save()
with mock.patch.object(spam_tasks.requests, 'head'):
self.user.save()
except AssertionError:
fails_at_end = True
print('\"' + should_fail + '\" passed but should have failed while testing that the validator ' + data['testsNegative'][should_fail])
if fails_at_end:
raise

# Not all domains that are permissable are possible to use as spam,
# some are correctly not extracted and not kept in notable domain so spot
# check some, not all, because not all `testsPositive` urls should be in
# NotableDomains
assert NotableDomain.objects.all().count() > previous_number_of_domains
assert NotableDomain.objects.get(domain='definitelyawebsite.com')
assert NotableDomain.objects.get(domain='a.b-c.de')

def test_validate_multiple_profile_websites_valid(self):
self.user.social = {'profileWebsites': ['http://cos.io/', 'http://thebuckstopshere.com', 'http://dinosaurs.com']}
self.user.save()
with mock.patch.object(spam_tasks.requests, 'head'):
self.user.save()
assert self.user.social['profileWebsites'] == ['http://cos.io/', 'http://thebuckstopshere.com', 'http://dinosaurs.com']

def test_validate_social_profile_websites_invalid(self):
Expand All @@ -2233,7 +2247,8 @@ def test_empty_social_links(self):

def test_profile_website_unchanged(self):
self.user.social = {'profileWebsites': ['http://cos.io/']}
self.user.save()
with mock.patch.object(spam_tasks.requests, 'head'):
self.user.save()
assert self.user.social_links['profileWebsites'] == ['http://cos.io/']
assert len(self.user.social_links) == 1

Expand All @@ -2244,7 +2259,8 @@ def test_various_social_handles(self):
'github': ['CenterForOpenScience'],
'scholar': 'ztt_j28AAAAJ'
}
self.user.save()
with mock.patch.object(spam_tasks.requests, 'head'):
self.user.save()
assert self.user.social_links == {
'profileWebsites': ['http://cos.io/'],
'twitter': 'http://twitter.com/OSFramework',
Expand All @@ -2258,7 +2274,8 @@ def test_multiple_profile_websites(self):
'twitter': ['OSFramework'],
'github': ['CenterForOpenScience']
}
self.user.save()
with mock.patch.object(spam_tasks.requests, 'head'):
self.user.save()
assert self.user.social_links == {
'profileWebsites': ['http://cos.io/', 'http://thebuckstopshere.com', 'http://dinosaurs.com'],
'twitter': 'http://twitter.com/OSFramework',
Expand Down
29 changes: 18 additions & 11 deletions tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
QuickFilesNode,
NotableDomain
)
from osf.external.spam import tasks as spam_tasks

from tests.base import (
assert_is_redirect,
Expand Down Expand Up @@ -1140,16 +1141,19 @@ def test_unserialize_social(self):
'twitter': 'howtopizza',
'github': 'frozenpizzacode',
}
self.app.put_json(
url,
payload,
auth=self.user.auth,
)
with mock.patch.object(spam_tasks.requests, 'head'):
resp = self.app.put_json(
url,
payload,
auth=self.user.auth,
)

self.user.reload()
for key, value in payload.items():
assert_equal(self.user.social[key], value)
assert_true(self.user.social['researcherId'] is None)

assert NotableDomain.objects.all()
assert NotableDomain.objects.get(domain='frozen.pizza.com')

# Regression test for help-desk ticket
Expand Down Expand Up @@ -1187,7 +1191,8 @@ def test_unserialize_social_validation_failure(self):
def test_serialize_social_editable(self):
self.user.social['twitter'] = 'howtopizza'
self.user.social['profileWebsites'] = ['http://www.cos.io', 'http://www.osf.io', 'http://www.wordup.com']
self.user.save()
with mock.patch.object(spam_tasks.requests, 'head'):
self.user.save()
url = api_url_for('serialize_social')
res = self.app.get(
url,
Expand All @@ -1202,12 +1207,14 @@ def test_serialize_social_not_editable(self):
user2 = AuthUserFactory()
self.user.social['twitter'] = 'howtopizza'
self.user.social['profileWebsites'] = ['http://www.cos.io', 'http://www.osf.io', 'http://www.wordup.com']
self.user.save()
with mock.patch.object(spam_tasks.requests, 'head'):
self.user.save()
url = api_url_for('serialize_social', uid=self.user._id)
res = self.app.get(
url,
auth=user2.auth,
)
with mock.patch.object(spam_tasks.requests, 'head'):
res = self.app.get(
url,
auth=user2.auth,
)
assert_equal(res.json.get('twitter'), 'howtopizza')
assert_equal(res.json.get('profileWebsites'), ['http://www.cos.io', 'http://www.osf.io', 'http://www.wordup.com'])
assert_true(res.json.get('github') is None)
Expand Down
1 change: 0 additions & 1 deletion website/profile/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,6 @@ def unserialize_social(auth, **kwargs):

try:
user.save()
user.check_spam(saved_fields=None, request_headers=request.headers)
except ValidationError as exc:
raise HTTPError(http_status.HTTP_400_BAD_REQUEST, data=dict(
message_long=exc.messages[0]
Expand Down

0 comments on commit e577e5d

Please sign in to comment.