docs: fixed multiprocessing in tutorial

HLasse · Jan 13, 2023 · 4ddebdf · 4ddebdf
1 parent c0fb63c
commit 4ddebdf
Showing 1 changed file with 7 additions and 7 deletions.
diff --git a/docs/tutorials/filter_corpus_using_quality.ipynb b/docs/tutorials/filter_corpus_using_quality.ipynb
@@ -319,7 +319,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -769,7 +769,7 @@
     "quality_pipe = nlp.add_pipe(\"textdescriptives/quality\")\n",
     "\n",
     "# 3. Apply the pipeline to the legal documents\n",
-    "legal_docs = nlp.pipe(legal[\"text\"], batch_size=100, n_process=4)"
+    "legal_docs = nlp.pipe(legal[\"text\"], batch_size=100, n_process=1)"
    ]
   },
   {
@@ -866,7 +866,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [
     {
@@ -875,7 +875,7 @@
        "QualityOutput(passed=False, n_stop_words=ThresholdsOutput(value=192.0, passed=True, threshold=(2.0, None)), alpha_ratio=ThresholdsOutput(value=0.8, passed=True, threshold=(0.7, None)), mean_word_length=ThresholdsOutput(value=4.55, passed=True, threshold=(3.0, 10.0)), doc_length=ThresholdsOutput(value=500.0, passed=True, threshold=(10.0, 100000.0)), symbol_to_word_ratio={'#': ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.1))}, proportion_ellipsis=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.3)), proportion_bullet_points=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.8)), contains={'lorem ipsum': ThresholdsOutput(value=0.0, passed=True, threshold=False)}, duplicate_line_chr_fraction=ThresholdsOutput(value=0.26, passed=False, threshold=(None, 0.2)), duplicate_paragraph_chr_fraction=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.2)), duplicate_ngram_chr_fraction={'5': ThresholdsOutput(value=0.54, passed=False, threshold=(None, 0.15)), '6': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.14)), '7': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.13)), '8': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.12)), '9': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.11)), '10': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.1))}, top_ngram_chr_fraction={'2': ThresholdsOutput(value=0.02, passed=True, threshold=(None, 0.2)), '3': ThresholdsOutput(value=0.04, passed=True, threshold=(None, 0.18)), '4': ThresholdsOutput(value=0.07, passed=True, threshold=(None, 0.16))})"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 43,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1035,9 +1035,9 @@
    "outputs": [],
    "source": [
     "# first we apply the pipeline to the other domains\n",
-    "news_docs = nlp.pipe(news[\"text\"], batch_size=100, n_process=4)\n",
+    "news_docs = nlp.pipe(news[\"text\"], batch_size=100, n_process=1)\n",
     "news_docs = list(news_docs)\n",
-    "speech_docs = nlp.pipe(speech[\"text\"], batch_size=100, n_process=4)\n",
+    "speech_docs = nlp.pipe(speech[\"text\"], batch_size=100, n_process=1)\n",
     "speech_docs = list(speech_docs)"
    ]
   },
@@ -1149,7 +1149,7 @@
    "source": [
     "From this we can see that a high proportion of the tokens in the speech dataset denote the speaker, and tokens such as `:` then lower the alpha ratio. This might or might not be problematic for the task at hand.\n",
     "\n",
-    "**Therefore it is important to note that while these filters are useful for filtering large amount of texts it is also important to know that they should probably be adjusted to the target domain.**"
+    "**Therefore, it is important to note that while these filters are useful for filtering large amounts of text, it is also important to know that they should be adjusted to the target domain.**"
    ]
   },
   {