From 5758194b88750c7ee27fc0477b58724b91db094e Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 17:33:04 +0200
Subject: [PATCH] tests: add tests for `scrapy-crawl-once`

* Adds tests for `scrapy-crawl-once` for FTP and FILE.

Addresses #161

Signed-off-by: Spiros Delviniotis
---
 tests/functional/arxiv/test_arxiv.py | 50 +++++++++++++-
 tests/functional/wsp/test_wsp.py     | 98 +++++++++++++++++++++++++++-
 2 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py
index 0538980e..76e8d705 100644
--- a/tests/functional/arxiv/test_arxiv.py
+++ b/tests/functional/arxiv/test_arxiv.py
@@ -75,7 +75,7 @@ def test_arxiv(set_up_local_environment, expected_results):
     results = CeleryMonitor.do_crawl(
         app=celery_app,
         monitor_timeout=5,
-        monitor_iter_limit=100,
+        monitor_iter_limit=20,
         crawler_instance=crawler,
         project=set_up_local_environment.get('CRAWLER_PROJECT'),
         spider='arXiv',
@@ -87,3 +87,51 @@ def test_arxiv(set_up_local_environment, expected_results):
     expected_results = [override_generated_fields(expected) for expected in expected_results]

     assert gotten_results == expected_results
+
+
+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'arxiv',
+            'fixtures',
+            'arxiv_smoke_record.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_arxiv_crawl_twice(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='arXiv',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='arXiv',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py
index 541b51c2..a97f77ef 100644
--- a/tests/functional/wsp/test_wsp.py
+++ b/tests/functional/wsp/test_wsp.py
@@ -44,7 +44,7 @@ def set_up_ftp_environment():
     )

     # The test must wait until the docker environment is up (takes about 10 seconds).
-    sleep(10)
+    sleep(7)

     yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
@@ -125,6 +125,54 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results):
     assert gotten_results == expected_results


+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'wsp',
+            'fixtures',
+            'wsp_smoke_records.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results):
+    crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
+
+
 @pytest.mark.parametrize(
     'expected_results',
     [
@@ -156,3 +204,51 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results):
     expected_results = [override_generated_fields(expected) for expected in expected_results]

     assert gotten_results == expected_results
+
+
+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'wsp',
+            'fixtures',
+            'wsp_smoke_records.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
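
Note on what the new `*_crawl_twice` tests exercise: each one crawls the same source twice and expects the second run to return no records, because the `scrapy-crawl-once` middleware remembers requests it has already served and drops them on later runs. The sketch below is a rough illustration of that wiring, not part of the patch and not hepcrawl's actual configuration; the middleware paths and the `crawl_once` meta key follow the scrapy-crawl-once README, while the priorities, file paths, URLs and the DemoSpider are assumptions.

# Illustrative sketch only -- not hepcrawl's real settings module or spiders.

import scrapy

# settings.py: enable the middleware for both the spider and downloader stages.
SPIDER_MIDDLEWARES = {
    'scrapy_crawl_once.CrawlOnceMiddleware': 100,
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawl_once.CrawlOnceMiddleware': 50,
}
CRAWL_ONCE_ENABLED = True                 # library default; set False to disable
CRAWL_ONCE_PATH = '.scrapy/crawl_once/'   # SQLite files that remember finished requests


class DemoSpider(scrapy.Spider):
    """Hypothetical spider showing how requests opt in to crawl-once behaviour."""

    name = 'demo'
    start_urls = ['file:///tmp/records/index.xml']  # placeholder source

    def parse(self, response):
        # Only requests that set crawl_once=True are recorded; on a second,
        # identical crawl they are dropped before download, so the run yields
        # no items -- the behaviour the `assert gotten_results == []` lines rely on.
        yield scrapy.Request(
            response.urljoin('record1.xml'),
            meta={'crawl_once': True},
            callback=self.parse_record,
        )

    def parse_record(self, response):
        yield {'url': response.url}

In the functional tests both crawls are scheduled against the same scrapyd service, so the state written by the first run is presumably still present when the second one starts, which is why the second crawl is expected to come back empty.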