From 92b0efa970560d9ec9b5d142bb65eba37a7a069d Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 17:33:04 +0200
Subject: [PATCH] tests: add tests for `scrapy-crawl-once`

* Adds: tests for `scrapy-crawl-once` covering the FTP and FILE sources.

Addresses #161

Signed-off-by: Spiros Delviniotis
---
 tests/functional/arxiv/test_arxiv.py | 49 ++++++++++++++
 tests/functional/wsp/test_wsp.py     | 98 +++++++++++++++++++++++++++-
 2 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py
index 2ef86b87..ce01eecb 100644
--- a/tests/functional/arxiv/test_arxiv.py
+++ b/tests/functional/arxiv/test_arxiv.py
@@ -88,3 +88,52 @@ def test_arxiv(set_up_local_environment, expected_results):
     expected_results = [override_generated_fields(expected) for expected in expected_results]
 
     assert gotten_results == expected_results
+
+
+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'arxiv',
+            'fixtures',
+            'arxiv_smoke_record.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_arxiv_crawl_twice(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        events_limit=1,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='arXiv',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='arXiv',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py
index ffa4c4c9..d034e7aa 100644
--- a/tests/functional/wsp/test_wsp.py
+++ b/tests/functional/wsp/test_wsp.py
@@ -44,7 +44,7 @@ def set_up_ftp_environment():
     )
 
     # The test must wait until the docker environment is up (takes about 10 seconds).
-    sleep(10)
+    sleep(7)
 
     yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
@@ -126,6 +126,54 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results):
     assert gotten_results == expected_results
 
 
+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'wsp',
+            'fixtures',
+            'wsp_smoke_records.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results):
+    crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
+
+
 @pytest.mark.parametrize(
     'expected_results',
     [
@@ -158,3 +206,51 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results):
     expected_results = [override_generated_fields(expected) for expected in expected_results]
 
     assert gotten_results == expected_results
+
+
+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'wsp',
+            'fixtures',
+            'wsp_smoke_records.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
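
Note (not part of the patch): the new `*_crawl_twice` tests assert that a
second, identical crawl yields no records, which is the behaviour provided by
the `scrapy-crawl-once` middleware. A minimal sketch of the setup the tests
rely on: the middleware registration below follows the scrapy-crawl-once
README, while the spider is hypothetical and only illustrates the per-request
opt-in.

    # Scrapy settings: register CrawlOnceMiddleware in both middleware
    # chains (the orders 100 and 50 are taken from the scrapy-crawl-once
    # README).
    SPIDER_MIDDLEWARES = {
        'scrapy_crawl_once.CrawlOnceMiddleware': 100,
    }
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_crawl_once.CrawlOnceMiddleware': 50,
    }

    # Hypothetical spider, for illustration only. Fingerprints of requests
    # marked with crawl_once are stored in a SQLite database under
    # CRAWL_ONCE_PATH, so they are skipped on later runs.
    import scrapy

    class RecordsSpider(scrapy.Spider):
        name = 'records'

        def start_requests(self):
            request = scrapy.Request('ftp://example.org/records.xml')
            request.meta['crawl_once'] = True  # opt this request in
            yield request

On the second run, requests already seen are dropped before download, the
spider yields nothing, and the final `assert gotten_results == []` in each
test holds.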