docs: add bricks training notebook (#211)

* added bricks notebook
* more unicode quotes; isd dataframe column fix
* fix remove_punctuation docs
* typo fixes
* put staging bricks in code
Unstructured-IO · Feb 10, 2023 · f890972 · f890972
1 parent d0c6d50
commit f890972
Show file tree

Hide file tree

Showing 8 changed files with 767 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,11 @@
-## 0.4.7-dev3
+## 0.4.7-dev4
 
 * Added the ability to pull an HTML document from a url in `partition_html`.
 * Added the ability to get file summary info from lists of filenames and lists
   of file contents.
 * Added optional page break to `partition` for `.pptx`, `.pdf`, images, and `.html` files.
 * Added `to_dict` method to document elements.
+* Include more unicode quotes in `replace_unicode_quotes`.
 
 ## 0.4.6
 

diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -587,10 +587,7 @@ Examples:
   from unstructured.cleaners.core import remove_punctuation
 
   # Returns "A lovely quote"
-  replace_unicode_characters("“A lovely quote!”")
-
-  # Returns ""
-  replace_unicode_characters("'()[]{};:'\",.?/\\-_")
+  remove_punctuation("“A lovely quote!”")
 
 
 ``clean_prefix``

diff --git a/example-docs/layout-parser-paper-fast.jpg b/example-docs/layout-parser-paper-fast.jpg
diff --git a/examples/training/1-Intro to Bricks.ipynb b/examples/training/1-Intro to Bricks.ipynb
diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py
@@ -48,7 +48,7 @@ def test_convert_to_isd_csv(output_csv_file):
         isd_csv_string = base.convert_to_isd_csv(elements)
         csv_file.write(isd_csv_string)
 
-    fieldnames = ["type", "text", "coordinates", "element_id"]
+    fieldnames = ["type", "text"]
     with open(output_csv_file, "r") as csv_file:
         csv_rows = csv.DictReader(csv_file)
         assert all(set(row.keys()) == set(fieldnames) for row in csv_rows)

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.7-dev3"  # pragma: no cover
+__version__ = "0.4.7-dev4"  # pragma: no cover
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
@@ -55,11 +55,31 @@ def replace_unicode_quotes(text) -> str:
     -------
     \x93What a lovely quote!\x94 -> “What a lovely quote!”
     """
+    # NOTE(robinson) - We should probably make this something more sane like a regex
+    # instead of a whole big series of replaces
     text = text.replace("\x91", "‘")
     text = text.replace("\x92", "’")
     text = text.replace("\x93", "“")
     text = text.replace("\x94", "”")
     text = text.replace("&apos;", "'")
+    text = text.replace("â\x80\x99", "'")
+    text = text.replace("â\x80“", "—")
+    text = text.replace("â\x80”", "–")
+    text = text.replace("â\x80˜", "‘")
+    text = text.replace("â\x80¦", "…")
+    text = text.replace("â\x80™", "’")
+    text = text.replace("â\x80œ", "“")
+    text = text.replace("â\x80?", "”")
+    text = text.replace("â\x80ť", "”")
+    text = text.replace("â\x80ś", "“")
+    text = text.replace("â\x80¨", "—")
+    text = text.replace("â\x80ł", "″")
+    text = text.replace("â\x80Ž", "")
+    text = text.replace("â\x80‚", "")
+    text = text.replace("â\x80‰", "")
+    text = text.replace("â\x80‹", "")
+    text = text.replace("â\x80", "")
+    text = text.replace("â\x80s'", "")
     return text
 
 

diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py
@@ -38,7 +38,7 @@ def convert_to_isd_csv(elements: List[Text]) -> str:
     Returns the representation of document elements as an Initial Structured Document (ISD)
     in CSV Format.
     """
-    csv_fieldnames: List[str] = ["type", "text", "coordinates", "element_id"]
+    csv_fieldnames: List[str] = ["type", "text"]
     rows: List[Dict[str, str]] = convert_to_isd(elements)
     with io.StringIO() as buffer:
         csv_writer = csv.DictWriter(buffer, fieldnames=csv_fieldnames)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.4.7-dev3" # pragma: no cover
		+__version__ = "0.4.7-dev4" # pragma: no cover