Skip to content

Commit

Permalink
docs: fixed multiprocessing in tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
KennethEnevoldsen committed Jan 13, 2023
1 parent c0fb63c commit 4ddebdf
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions docs/tutorials/filter_corpus_using_quality.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -769,7 +769,7 @@
"quality_pipe = nlp.add_pipe(\"textdescriptives/quality\")\n",
"\n",
"# 3. Apply the pipeline to the legal documents\n",
"legal_docs = nlp.pipe(legal[\"text\"], batch_size=100, n_process=4)"
"legal_docs = nlp.pipe(legal[\"text\"], batch_size=100, n_process=1)"
]
},
{
Expand Down Expand Up @@ -866,7 +866,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 43,
"metadata": {},
"outputs": [
{
Expand All @@ -875,7 +875,7 @@
"QualityOutput(passed=False, n_stop_words=ThresholdsOutput(value=192.0, passed=True, threshold=(2.0, None)), alpha_ratio=ThresholdsOutput(value=0.8, passed=True, threshold=(0.7, None)), mean_word_length=ThresholdsOutput(value=4.55, passed=True, threshold=(3.0, 10.0)), doc_length=ThresholdsOutput(value=500.0, passed=True, threshold=(10.0, 100000.0)), symbol_to_word_ratio={'#': ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.1))}, proportion_ellipsis=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.3)), proportion_bullet_points=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.8)), contains={'lorem ipsum': ThresholdsOutput(value=0.0, passed=True, threshold=False)}, duplicate_line_chr_fraction=ThresholdsOutput(value=0.26, passed=False, threshold=(None, 0.2)), duplicate_paragraph_chr_fraction=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.2)), duplicate_ngram_chr_fraction={'5': ThresholdsOutput(value=0.54, passed=False, threshold=(None, 0.15)), '6': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.14)), '7': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.13)), '8': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.12)), '9': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.11)), '10': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.1))}, top_ngram_chr_fraction={'2': ThresholdsOutput(value=0.02, passed=True, threshold=(None, 0.2)), '3': ThresholdsOutput(value=0.04, passed=True, threshold=(None, 0.18)), '4': ThresholdsOutput(value=0.07, passed=True, threshold=(None, 0.16))})"
]
},
"execution_count": 22,
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1035,9 +1035,9 @@
"outputs": [],
"source": [
"# first we apply the pipeline to the other domains\n",
"news_docs = nlp.pipe(news[\"text\"], batch_size=100, n_process=4)\n",
"news_docs = nlp.pipe(news[\"text\"], batch_size=100, n_process=1)\n",
"news_docs = list(news_docs)\n",
"speech_docs = nlp.pipe(speech[\"text\"], batch_size=100, n_process=4)\n",
"speech_docs = nlp.pipe(speech[\"text\"], batch_size=100, n_process=1)\n",
"speech_docs = list(speech_docs)"
]
},
Expand Down Expand Up @@ -1149,7 +1149,7 @@
"source": [
"From this we can see that a high proportion of the tokens in the speech dataset denotes the speaker, and tokens such as `:` then lower the alpha ratio. This might or might not be problematic for the task at hand.\n",
"\n",
"**Therefore it is important to note that while these filters are useful for filtering large amount of texts it is also important to know that they should probably be adjusted to the target domain.**"
"**Therefore it is important to note that while these filters are useful for filtering large amounts of text, they should be adjusted to the target domain.**"
]
},
{
Expand Down

0 comments on commit 4ddebdf

Please sign in to comment.