diff --git a/README.rst b/README.rst
index c4bac6d..9fe7e2e 100644
--- a/README.rst
+++ b/README.rst
@@ -35,20 +35,11 @@ This package provides an iterator over the dataset stored at Google. It
 decompresses the data on the fly and provides you the access to the underlying
 data.
 
+Features
+========
 
-Example use
-===========
-
->>> from google_ngram_downloader import readline_google_store
->>>
->>> fname, url, records = next(readline_google_store(ngram_len=5))
->>> fname
-'googlebooks-eng-all-5gram-20120701-0.gz'
->>> url
-'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-0.gz'
->>> next(records)
-Record(ngram=u'0 " A most useful', year=1860, match_count=1, volume_count=1)
-
+* Download ngrams of various lengths and languages.
+* Access to part of the ngrams, e.g. the ones that start with an 'a'.
 
 Installation
 ============
@@ -62,6 +53,28 @@ The command line tool
 =====================
 
 It also provides a simple command line tool to download the ngrams called
-`google-ngram-downloader`.
+`google-ngram-downloader`. Refer to the help to see available actions::
+
+    google-ngram-downloader help
+    usage: google-ngram-downloader [options]
+    commands:
+        cooccurrence   Write the cooccurrence frequencies of a word and its contexts.
+        download       Download The Google Books Ngram Viewer dataset version 20120701.
+        help           Show help for a given help topic or a help overview.
+        readline       Print the raw content.
+
+
+Example use of the API
+======================
+
+>>> from google_ngram_downloader import readline_google_store
+>>>
+>>> fname, url, records = next(readline_google_store(ngram_len=5))
+>>> fname
+'googlebooks-eng-all-5gram-20120701-0.gz'
+>>> url
+'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-0.gz'
+>>> next(records)
+Record(ngram=u'0 " A most useful', year=1860, match_count=1, volume_count=1)
 
 
diff --git a/google_ngram_downloader/__main__.py b/google_ngram_downloader/__main__.py
index 8dd32b7..81ef84c 100644
--- a/google_ngram_downloader/__main__.py
+++ b/google_ngram_downloader/__main__.py
@@ -18,7 +18,11 @@ def download(
     ngram_len=('n', 1, 'The length of ngrams to be downloaded.'),
     output=('o', 'downloads/google_ngrams/{ngram_len}', 'The destination folder for downloaded files.'),
     verbose=('v', False, 'Be verbose.'),
-    lang=('l', "eng", 'Language: eng'),
+    lang=(
+        'l',
+        'eng',
+        'Language. [eng|eng-us|eng-gb|eng-fiction|chi-sim|fre|ger|heb|ita|rus|spa]',
+    ),
 ):
     """Download The Google Books Ngram Viewer dataset version 20120701."""
     output = local(output.format(ngram_len=ngram_len))
@@ -39,10 +43,18 @@ def cooccurrence(
     output=('o', 'downloads/google_ngrams/{ngram_len}_cooccurrence', 'The destination folder for downloaded files.'),
     verbose=('v', False, 'Be verbose.'),
     rewrite=('r', False, 'Always rewrite existing files.'),
-    records_in_file=('', 50000000, 'The number of records to be read from the Google store to store in a .json.gz file.'),
-    lang=('l', "eng", 'Language: eng'),
+    records_in_file=(
+        '',
+        50000000,
+        'The number of records to be read from the Google store to store in a .json.gz file.',
+    ),
+    lang=(
+        'l',
+        'eng',
+        'Language. [eng|eng-us|eng-gb|eng-fiction|chi-sim|fre|ger|heb|ita|rus|spa]',
+    ),
 ):
-    """Write the cooccurrence frequncis of a word and its contexts."""
+    """Write the cooccurrence frequencies of a word and its contexts."""
     assert ngram_len > 1
     output_dir = local(output.format(ngram_len=ngram_len))
     output_dir.ensure_dir()
@@ -84,7 +96,11 @@ def cooccurrence(
 @command()
 def readline(
     ngram_len=('n', 2, 'The length of ngrams to be downloaded.'),
-    lang=('l', "eng", 'Language: eng'),
+    lang=(
+        'l',
+        'eng',
+        'Language. [eng|eng-us|eng-gb|eng-fiction|chi-sim|fre|ger|heb|ita|rus|spa]',
+    ),
 ):
     """Print the raw content."""
 
diff --git a/google_ngram_downloader/util.py b/google_ngram_downloader/util.py
index d7d5e0b..ecdefa5 100644
--- a/google_ngram_downloader/util.py
+++ b/google_ngram_downloader/util.py
@@ -20,9 +20,10 @@ def readline_google_store(ngram_len, lang='eng', indices=None, chunk_size=1024 *
     """Iterate over the data in the Google ngram collectioin.
 
     :param int ngram_len: the length of ngrams to be streamed.
+    :param str lang: the language of the ngrams.
+    :param iter indices: the file indices to be downloaded.
     :param int chunk_size: the size the chunks of raw compressed data.
-    :param bool verbose: if `True`, then the debug information is shown to
-        `sys.stderr`.
+    :param bool verbose: if `True`, then the debug information is shown to `sys.stderr`.
 
     :returns: a iterator over triples `(fname, url, records)`
 
@@ -89,7 +90,14 @@ def count_coccurrence(records, index):
 
 
 def iter_google_store(ngram_len, lang="eng", indices=None, verbose=False):
-    """Iterate over the collection files stored at Google."""
+    """Iterate over the collection files stored at Google.
+
+    :param int ngram_len: the length of ngrams to be streamed.
+    :param str lang: the language of the ngrams.
+    :param iter indices: the file indices to be downloaded.
+    :param bool verbose: if `True`, then the debug information is shown to `sys.stderr`.
+
+    """
     version = '20120701'
     session = requests.Session()
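
For reference, a minimal sketch of how the `lang` and `indices` parameters documented above can be combined with `readline_google_store`; the index value `'0'` is an assumption based on the file-name suffix in the README example (`...-5gram-20120701-0.gz`)::

    from google_ngram_downloader import readline_google_store

    # Stream only the English 5-gram file whose index is '0'. The index format
    # (a plain file-name suffix such as '0') is an assumption drawn from the
    # 'googlebooks-eng-all-5gram-20120701-0.gz' example in the README.
    for fname, url, records in readline_google_store(ngram_len=5, lang='eng', indices=['0']):
        print(fname)
        print(url)
        print(next(records))  # first Record of the (very large) compressed stream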