
tests: add tests for scrapy-crawl-once
* Add tests for `scrapy-crawl-once` over FTP and local-file (FILE) sources.

Addresses inspirehep#161

Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 21, 2017
1 parent 06e106f commit 5758194
Showing 2 changed files with 146 additions and 2 deletions.
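
For context: `scrapy-crawl-once` is a Scrapy middleware that records a fingerprint for each request it handles and drops requests it has already crawled, which is why each new test below asserts that a second, identical crawl yields no results. A minimal sketch of how the middleware is typically enabled, following the library's README (whether this project's settings match this exactly is not shown in the diff):

# settings.py -- minimal scrapy-crawl-once wiring, per the library's README.
SPIDER_MIDDLEWARES = {
    'scrapy_crawl_once.CrawlOnceMiddleware': 100,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawl_once.CrawlOnceMiddleware': 50,
}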
50 changes: 49 additions & 1 deletion tests/functional/arxiv/test_arxiv.py
@@ -75,7 +75,7 @@ def test_arxiv(set_up_local_environment, expected_results):
     results = CeleryMonitor.do_crawl(
         app=celery_app,
         monitor_timeout=5,
-        monitor_iter_limit=100,
+        monitor_iter_limit=20,
         crawler_instance=crawler,
         project=set_up_local_environment.get('CRAWLER_PROJECT'),
         spider='arXiv',
@@ -87,3 +87,51 @@ def test_arxiv(set_up_local_environment, expected_results):
     expected_results = [override_generated_fields(expected) for expected in expected_results]
 
     assert gotten_results == expected_results
+
+
+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'arxiv',
+            'fixtures',
+            'arxiv_smoke_record.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_arxiv_crawl_twice(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='arXiv',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='arXiv',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
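
By default the middleware only acts on requests that opt in through their meta, so a second run can return an empty result set like the tests above expect. A hypothetical spider snippet illustrating the opt-in (the spider name and selector are invented for illustration; setting `CRAWL_ONCE_DEFAULT = True` would apply this to every request instead):

import scrapy

class ExampleSpider(scrapy.Spider):
    # Hypothetical spider, for illustration only.
    name = 'example'

    def parse(self, response):
        for url in response.css('a::attr(href)').extract():
            # With crawl_once=True, CrawlOnceMiddleware stores this request's
            # fingerprint on the first run and skips it on later runs.
            yield scrapy.Request(url, meta={'crawl_once': True})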
98 changes: 97 additions & 1 deletion tests/functional/wsp/test_wsp.py
@@ -44,7 +44,7 @@ def set_up_ftp_environment():
     )
 
     # The test must wait until the docker environment is up (takes about 10 seconds).
-    sleep(10)
+    sleep(7)
 
     yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
@@ -125,6 +125,54 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results):
     assert gotten_results == expected_results
 
 
+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'wsp',
+            'fixtures',
+            'wsp_smoke_records.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results):
+    crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
+
+
 @pytest.mark.parametrize(
     'expected_results',
     [
@@ -156,3 +204,51 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results):
     expected_results = [override_generated_fields(expected) for expected in expected_results]
 
     assert gotten_results == expected_results
+
+
+@pytest.mark.parametrize(
+    'expected_results',
+    [
+        expected_json_results_from_file(
+            'wsp',
+            'fixtures',
+            'wsp_smoke_records.json',
+        ),
+    ],
+    ids=[
+        'crawl_twice',
+    ]
+)
+def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=20,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+
+    assert gotten_results == []
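
One practical note on these tests: per the library's docs, the middleware persists its fingerprints on disk (by default one SQLite file per spider under the project's `.scrapy/crawl_once/` directory), so the second crawl only comes back empty if that state survives between the two `do_crawl` calls and is wiped before the first. A sketch of a cleanup helper, assuming the default path (the helper name is invented):

import os
import shutil

def clean_crawl_once_state(project_dir):
    # Hypothetical helper: drop persisted crawl-once fingerprints so the
    # next test run starts with no requests marked as already seen.
    path = os.path.join(project_dir, '.scrapy', 'crawl_once')
    if os.path.isdir(path):
        shutil.rmtree(path)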
