From 82e8ee6d33d8a3e9b33dc96140919a7a1b45805c Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 21 Mar 2022 18:09:45 +0100 Subject: [PATCH 1/6] fix tutorial 4 dataset path --- haystack/telemetry.py | 2 +- tutorials/Tutorial4_FAQ_style_QA.ipynb | 2 +- tutorials/Tutorial4_FAQ_style_QA.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/telemetry.py b/haystack/telemetry.py index 99f92ae047..5ae12e0050 100644 --- a/haystack/telemetry.py +++ b/haystack/telemetry.py @@ -190,7 +190,7 @@ def send_tutorial_event(url: str): "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip": "1", "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/squad_small.json.zip": "2", "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip": "3", - "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/faq_covidbert.csv.zip": "4", + "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip": "4", "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip": "5", "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip": "6", "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip": "7", diff --git a/tutorials/Tutorial4_FAQ_style_QA.ipynb b/tutorials/Tutorial4_FAQ_style_QA.ipynb index c6c59b41d2..c1c50dad5f 100644 --- a/tutorials/Tutorial4_FAQ_style_QA.ipynb +++ b/tutorials/Tutorial4_FAQ_style_QA.ipynb @@ -225,7 +225,7 @@ "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n", "\n", "# Get dataframe with columns \"question\", \"answer\" and some custom metadata\n", - "df = pd.read_csv(\"small_faq_covid.csv\")\n", + "df = pd.read_csv(f\"{doc_dir}/small_faq_covid.csv\")\n", "# Minimal cleaning\n", "df.fillna(value=\"\", inplace=True)\n", "df[\"question\"] = df[\"question\"].apply(lambda x: x.strip())\n", diff --git a/tutorials/Tutorial4_FAQ_style_QA.py b/tutorials/Tutorial4_FAQ_style_QA.py index ecd31ed334..83f7d7ef95 100755 --- a/tutorials/Tutorial4_FAQ_style_QA.py +++ b/tutorials/Tutorial4_FAQ_style_QA.py @@ -58,7 +58,7 @@ def tutorial4_faq_style_qa(): fetch_archive_from_http(url=s3_url, output_dir=doc_dir) # Get dataframe with columns "question", "answer" and some custom metadata - df = pd.read_csv("small_faq_covid.csv") + df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv") # Minimal cleaning df.fillna(value="", inplace=True) df["question"] = df["question"].apply(lambda x: x.strip()) From 59f6c5dc7a355dd0a55d8b453010de453956bccc Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 21 Mar 2022 18:22:07 +0100 Subject: [PATCH 2/6] fix tutorial 8 dataset path --- tutorials/Tutorial8_Preprocessing.ipynb | 9 +++++---- tutorials/Tutorial8_Preprocessing.py | 10 ++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tutorials/Tutorial8_Preprocessing.ipynb b/tutorials/Tutorial8_Preprocessing.ipynb index 87c170e698..996c4c9c40 100644 --- a/tutorials/Tutorial8_Preprocessing.ipynb +++ b/tutorials/Tutorial8_Preprocessing.ipynb @@ -62,6 +62,7 @@ "#! pip install farm-haystack\n", "\n", "# Install the latest master of Haystack\n", + "from pathlib import Path\n", "!pip install --upgrade pip\n", "!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]\n", "\n", @@ -163,13 +164,13 @@ "# Here are some examples of how you would use file converters\n", "\n", "converter = TextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n", - "doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)[0]\n", + "doc_txt = converter.convert(file_path=Path(f\"{doc_dir}/classics.txt\"), meta=None)[0]\n", "\n", "converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n", - "doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)[0]\n", + "doc_pdf = converter.convert(file_path=Path(f\"{doc_dir}/bert.pdf\"), meta=None)[0]\n", "\n", "converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=[\"en\"])\n", - "doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)[0]" + "doc_docx = converter.convert(file_path=Path(f\"{doc_dir}/heavy_metal.docx\"), meta=None)[0]" ] }, { @@ -195,7 +196,7 @@ "source": [ "# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.\n", "\n", - "all_docs = convert_files_to_dicts(dir_path=\"data/preprocessing_tutorial\")" + "all_docs = convert_files_to_dicts(dir_path=doc_dir)" ] }, { diff --git a/tutorials/Tutorial8_Preprocessing.py b/tutorials/Tutorial8_Preprocessing.py index 095a083386..ef3d9eba09 100644 --- a/tutorials/Tutorial8_Preprocessing.py +++ b/tutorials/Tutorial8_Preprocessing.py @@ -18,6 +18,8 @@ """ # Here are the imports we need +from pathlib import Path + from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor from haystack.utils import convert_files_to_dicts, fetch_archive_from_http @@ -42,17 +44,17 @@ def tutorial8_preprocessing(): # Here are some examples of how you would use file converters converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"]) - doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0] + doc_txt = converter.convert(file_path=Path(f"{doc_dir}/classics.txt"), meta=None)[0] converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"]) - doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0] + doc_pdf = converter.convert(file_path=Path(f"{doc_dir}/bert.pdf"), meta=None)[0] converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"]) - doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0] + doc_docx = converter.convert(file_path=Path(f"{doc_dir}/heavy_metal.docx"), meta=None)[0] # Haystack also has a convenience function that will automatically apply the right converter to each file in a directory. - all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial") + all_docs = convert_files_to_dicts(dir_path=doc_dir) """ From 27ac021b45812cf91cbf4d52db1155fe6ee54016 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 21 Mar 2022 18:38:33 +0100 Subject: [PATCH 3/6] fix tutorial 10 event --- haystack/utils/import_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/utils/import_utils.py b/haystack/utils/import_utils.py index 47fe199115..d8c9497756 100644 --- a/haystack/utils/import_utils.py +++ b/haystack/utils/import_utils.py @@ -69,7 +69,7 @@ def fetch_archive_from_http(url: str, output_dir: str, proxies: Optional[dict] = if not path.exists(): path.mkdir(parents=True) - if "deepset.ai-farm-qa/datasets" in url or "dl.fbaipublicfiles.com" in url: + if "deepset.ai-farm-qa/datasets" in url or "dl.fbaipublicfiles.com" in url or "fandom-qa.s3" in url: send_tutorial_event(url=url) is_not_empty = len(list(Path(path).rglob("*"))) > 0 From b6be61ba9ae98a68706e9163de90f7b2977ea1d5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Mar 2022 17:38:58 +0000 Subject: [PATCH 4/6] Update Documentation & Code Style --- docs/_src/tutorials/tutorials/4.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_src/tutorials/tutorials/4.md b/docs/_src/tutorials/tutorials/4.md index c38a2e74e8..3fa80497fd 100644 --- a/docs/_src/tutorials/tutorials/4.md +++ b/docs/_src/tutorials/tutorials/4.md @@ -131,7 +131,7 @@ s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/docu fetch_archive_from_http(url=s3_url, output_dir=doc_dir) # Get dataframe with columns "question", "answer" and some custom metadata -df = pd.read_csv("small_faq_covid.csv") +df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv") # Minimal cleaning df.fillna(value="", inplace=True) df["question"] = df["question"].apply(lambda x: x.strip()) From 2d03983ad3cd31d650845f0a65ce5ef904d1d5cf Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 21 Mar 2022 21:48:51 +0100 Subject: [PATCH 5/6] fix send event for tutorial 15 --- haystack/telemetry.py | 2 +- tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/telemetry.py b/haystack/telemetry.py index 5ae12e0050..da257ae50c 100644 --- a/haystack/telemetry.py +++ b/haystack/telemetry.py @@ -202,7 +202,7 @@ def send_tutorial_event(url: str): "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip": "12", # Tutorial 13: no dataset available yet "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip": "14", - "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_tables_sample.json.zip": "15", + "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip": "15", # "https://nlp.stanford.edu/data/glove.6B.zip": "16", "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip": "16", } diff --git a/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb b/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb index 369dd6a70c..e5c2a30f66 100644 --- a/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb +++ b/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb @@ -46,7 +46,7 @@ "\n", "# Install the latest master of Haystack\n", "!pip install --upgrade pip\n", - "!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]\n", + "!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab, ocr]\n", "\n", "!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz\n", "!tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin\n", From 0c2bf58b7684ae0292eee3d276412fe18fd81d2f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Mar 2022 21:28:59 +0000 Subject: [PATCH 6/6] Update Documentation & Code Style --- docs/_src/tutorials/tutorials/16.md | 2 +- docs/_src/tutorials/tutorials/8.md | 10 ++++++---- tutorials/Tutorial8_Preprocessing.ipynb | 1 + 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/_src/tutorials/tutorials/16.md b/docs/_src/tutorials/tutorials/16.md index ff48741c2d..c622091ae7 100644 --- a/docs/_src/tutorials/tutorials/16.md +++ b/docs/_src/tutorials/tutorials/16.md @@ -26,7 +26,7 @@ This tutorial will show you how to integrate a classification model into your pr # Install the latest master of Haystack !pip install --upgrade pip -!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab, ocr] !wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz !tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin diff --git a/docs/_src/tutorials/tutorials/8.md b/docs/_src/tutorials/tutorials/8.md index a08818b15e..ffbd75de36 100644 --- a/docs/_src/tutorials/tutorials/8.md +++ b/docs/_src/tutorials/tutorials/8.md @@ -33,6 +33,8 @@ This tutorial will show you all the tools that Haystack provides to help you cas #! pip install farm-haystack # Install the latest master of Haystack +from pathlib import Path + !pip install --upgrade pip !pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr] @@ -69,20 +71,20 @@ For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't # Here are some examples of how you would use file converters converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"]) -doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0] +doc_txt = converter.convert(file_path=Path(f"{doc_dir}/classics.txt"), meta=None)[0] converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"]) -doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0] +doc_pdf = converter.convert(file_path=Path(f"{doc_dir}/bert.pdf"), meta=None)[0] converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"]) -doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0] +doc_docx = converter.convert(file_path=Path(f"{doc_dir}/heavy_metal.docx"), meta=None)[0] ``` ```python # Haystack also has a convenience function that will automatically apply the right converter to each file in a directory. -all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial") +all_docs = convert_files_to_dicts(dir_path=doc_dir) ``` ## PreProcessor diff --git a/tutorials/Tutorial8_Preprocessing.ipynb b/tutorials/Tutorial8_Preprocessing.ipynb index 996c4c9c40..6b024c56e6 100644 --- a/tutorials/Tutorial8_Preprocessing.ipynb +++ b/tutorials/Tutorial8_Preprocessing.ipynb @@ -63,6 +63,7 @@ "\n", "# Install the latest master of Haystack\n", "from pathlib import Path\n", + "\n", "!pip install --upgrade pip\n", "!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]\n", "\n",