diff --git a/README.md b/README.md
index 84ce801c..58937010 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ To create them you need an example html files you want to extract.
 You can use the following command to get html files from the CommonCrawl dataset:
 
 ```bash
-$ cmon download --match_type=domain --limit=100 html_output html example.com
+$ cmon download --match_type=domain --limit=100 html_output example.com html
 ```
 
 This will download a first 100 html files from example.com and save them in html_output.
@@ -106,7 +106,7 @@ In our case the config would look like this:
 To test the extraction, you can use the following command:
 
 ```bash
-$ cmon extract config.json extracted_output html html_output/*.html
+$ cmon extract config.json extracted_output html_output/*.html html
 ```
 
 ### Crawl the sites
@@ -117,16 +117,16 @@ To do this you will proceed in two steps:
 To do this, you can use the following command:
 
 ```bash
-cmon download --match_type=domain --limit=100000 dr_output record example.com
+cmon download --match_type=domain --limit=100 dr_output example.com record
 ```
 
-This will download the first 100000 records from example.com and save them in dr_output. By default it saves 100_000 records per file, you can change this with the --max_crawls_per_file option.
+This will download the first 100 records from example.com and save them in dr_output. By default it saves 100_000 records per file, you can change this with the --max_crawls_per_file option.
 
 #### 2. Extract the records
 Once you have the records, you can use the following command to extract them:
 
 ```bash
-$ cmon extract --n_proc=4 config.json extracted_output record dr_output/*.jsonl
+$ cmon extract --n_proc=4 config.json extracted_output dr_output/*.jsonl record
 ```
 
 Note that you can use the --n_proc option to specify the number of processes to use for the extraction. Multiprocessing is done on file level, so if you have just one file it will not be used.
diff --git a/cmoncrawl/integrations/download.py b/cmoncrawl/integrations/download.py
index 3f96ea9b..49bcca1a 100644
--- a/cmoncrawl/integrations/download.py
+++ b/cmoncrawl/integrations/download.py
@@ -81,13 +81,9 @@ def add_mode_args(subparser: Any):
 def add_args(subparser: Any):
     parser = subparser.add_parser("download", help="Download data from Common Crawl")
     parser.add_argument("output", type=Path, help="Path to output directory")
-    mode_subparser = parser.add_subparsers(
-        dest="mode", required=True, help="Download mode"
-    )
     parser.add_argument(
         "urls", type=str, nargs="+", help="URLs to download, e.g. www.bcc.cz."
     )
-    mode_subparser = add_mode_args(mode_subparser)
     parser.add_argument(
         "--limit", type=int, default=5, help="Max number of urls to download"
     )
@@ -155,6 +151,10 @@ def add_args(subparser: Any):
         default=None,
         help="S3 bucket to use for Athena. If set, the query results will be stored in the bucket and reused for later queries. Make sure to delete the bucket afterwards.",
     )
+    mode_subparser = parser.add_subparsers(
+        dest="mode", required=True, help="Download mode"
+    )
+    mode_subparser = add_mode_args(mode_subparser)
 
     parser.set_defaults(func=run_download)
 
@@ -316,10 +316,15 @@ async def url_download(
 
 def run_download(args: argparse.Namespace):
     mode = DownloadOutputFormat(args.mode)
-    encoding = args.encoding if mode == DownloadOutputFormat.HTML else None
+    # Record exclusives
     max_crawls_per_file = (
         args.max_crawls_per_file if mode == DownloadOutputFormat.RECORD else 1
     )
+    # HTML exclusives
+    encoding = args.encoding if mode == DownloadOutputFormat.HTML else None
+    download_method = (
+        DAOname(args.download_method) if mode == DownloadOutputFormat.HTML else None
+    )
     return asyncio.run(
         url_download(
             urls=args.urls,
@@ -337,7 +342,7 @@ def run_download(args: argparse.Namespace):
             aggregator_type=args.aggregator,
             max_directory_size=args.max_directory_size,
             filter_non_200=args.filter_non_200,
-            download_method=args.download_method,
+            download_method=download_method,
             s3_bucket=args.s3_bucket,
         )
     )
diff --git a/cmoncrawl/integrations/extract.py b/cmoncrawl/integrations/extract.py
index 5f054d79..01e927d5 100644
--- a/cmoncrawl/integrations/extract.py
+++ b/cmoncrawl/integrations/extract.py
@@ -102,14 +102,14 @@ def add_args(subparser: Any):
         default=1,
         help="Number of processes to use for extraction. The paralelization is on file level, thus for single file it's useless to use more than one process.",
     )
+    parser.add_argument(
+        "files", nargs="+", type=Path, help="Files to extract data from"
+    )
 
     mode_subparser = parser.add_subparsers(
         dest="mode", required=True, help="Extraction mode"
     )
     mode_subparser = add_mode_args(mode_subparser)
-    parser.add_argument(
-        "files", nargs="+", type=Path, help="Files to extract data from"
-    )
 
     parser.set_defaults(func=run_extract)
 
@@ -216,24 +216,34 @@ def _extract_task(
     args: argparse.Namespace,
 ):
     mode = ExtractMode(args.mode)
-    download_method = DAOname(args.download_method) if args.download_method else None
 
     # We have to setup loggers / aws in each process
     setup_loggers(args.verbosity)
     CONFIG.update_from_cli(args)
+    # HTML exclusives
+    url = args.url if mode == ExtractMode.HTML else None
+    date = args.date if mode == ExtractMode.HTML else None
+
+    # Record exclusives
+    max_retry = args.max_retry if mode == ExtractMode.RECORD else 0
+    sleep_base = args.sleep_base if mode == ExtractMode.RECORD else 0
+    download_method = (
+        DAOname(args.download_method) if mode == ExtractMode.RECORD else None
+    )
+
     asyncio.run(
         extract_from_files(
             output_path=output_path,
             config=config,
             files=files,
             mode=mode,
-            url=args.url if mode == ExtractMode.HTML else None,
-            date=args.date if mode == ExtractMode.HTML else None,
+            url=url,
+            date=date,
             max_directory_size=args.max_directory_size,
             max_crawls_per_file=args.max_crawls_per_file,
-            max_retry=args.max_retry if mode == ExtractMode.RECORD else 0,
-            sleep_base=args.sleep_base if mode == ExtractMode.RECORD else 0,
+            max_retry=max_retry,
+            sleep_base=sleep_base,
             download_method=download_method,
         )
     )
diff --git a/docs/source/cli/cli.rst b/docs/source/cli/cli.rst
index 345bb331..aa47bb46 100644
--- a/docs/source/cli/cli.rst
+++ b/docs/source/cli/cli.rst
@@ -31,10 +31,10 @@ Examples
     cmon download --match_type=domain --limit=100 html_output html example.com
 
     # Take the domain records downloaded using the first command and extracts them using your extractors
-    cmon extract config.json extracted_output record dr_output/*.jsonl
+    cmon extract config.json extracted_output dr_output/*.jsonl record
 
     # Take the htmls downloaded using the second command and extracts them using your extractors
-    cmon extract config.json extracted_output html html_output/*.html
+    cmon extract config.json extracted_output html_output/*.html html
 
diff --git a/docs/source/cli/extract.rst b/docs/source/cli/extract.rst
index 4eb73f06..220a2178 100644
--- a/docs/source/cli/extract.rst
+++ b/docs/source/cli/extract.rst
@@ -77,10 +77,10 @@ Examples
 .. code-block:: bash
 
     # Take the domain records downloaded using the first command and extracts them using your extractors
-    cmon extract config.json extracted_output record --max_retry 100 --download_method=gateway --sleep_base 1.3 dr_output/*.jsonl
+    cmon extract config.json extracted_output dr_output/*.jsonl record --max_retry 100 --download_method=gateway --sleep_base 1.3
 
     # Take the htmls downloaded using the second command and extracts them using your extractors
-    cmon extract config.json extracted_output html --date 2021-01-01 --url https://www.example.com html_output/*.html
+    cmon extract config.json extracted_output html_output/*.html html --date 2021-01-01 --url https://www.example.com
 
 When you are going to build the extractors, you will appreciate that you can specify what the URL of the HTML file is and what the date of the extraction is. This is because
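
The argparse change above is mostly about ordering: the positional `urls` / `files` arguments are now registered before `add_subparsers` is called, so on the command line the mode subcommand comes last and its mode-specific options follow it. Below is a minimal, self-contained sketch of that pattern. It is not code from the repository; the program name, help strings, and defaults are illustrative, with `--encoding` and `--max_crawls_per_file` standing in for the mode options referenced in `run_download`.

```python
import argparse

# Illustrative sketch (not repository code): positionals are declared
# before add_subparsers(), so the CLI reads
#   <output> <urls...> <mode> [mode-specific options]
parser = argparse.ArgumentParser(prog="demo-download")
parser.add_argument("output", help="Path to output directory")
parser.add_argument("urls", nargs="+", help="URLs to download")
parser.add_argument("--limit", type=int, default=5, help="Max number of urls")

mode_subparser = parser.add_subparsers(dest="mode", required=True, help="Download mode")

html_parser = mode_subparser.add_parser("html")
html_parser.add_argument("--encoding", default=None)

record_parser = mode_subparser.add_parser("record")
record_parser.add_argument("--max_crawls_per_file", type=int, default=100_000)

# Top-level flags come first, mode-specific flags after the mode name.
args = parser.parse_args(
    ["--limit", "100", "out_dir", "example.com", "html", "--encoding", "utf-8"]
)
print(args.output, args.urls, args.mode, args.encoding)
# out_dir ['example.com'] html utf-8
```

Because the subparser action consumes the mode name and everything after it, mode-specific flags have to appear after the mode, which matches the ordering in the updated README and docs examples (e.g. `... dr_output/*.jsonl record --max_retry 100`).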