diff --git a/incubator-tools/advance_table_line_enhancement/Images/after_line_enhancement_sample.png b/incubator-tools/advance_table_line_enhancement/Images/after_line_enhancement_sample.png new file mode 100644 index 000000000..e05a0a827 Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/after_line_enhancement_sample.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/cde_train_sample.png b/incubator-tools/advance_table_line_enhancement/Images/cde_train_sample.png new file mode 100644 index 000000000..1119f197b Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/cde_train_sample.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/fp_tables_to_csv_output_csv_sample.png b/incubator-tools/advance_table_line_enhancement/Images/fp_tables_to_csv_output_csv_sample.png new file mode 100644 index 000000000..02a6e943f Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/fp_tables_to_csv_output_csv_sample.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/fp_tables_to_csv_output_folder.png b/incubator-tools/advance_table_line_enhancement/Images/fp_tables_to_csv_output_folder.png new file mode 100644 index 000000000..e2ff87328 Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/fp_tables_to_csv_output_folder.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/leb_folder_sample.png b/incubator-tools/advance_table_line_enhancement/Images/leb_folder_sample.png new file mode 100644 index 000000000..3aa34a65e Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/leb_folder_sample.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/line_enhancement_basic_table_img(pg1_tb0.csv).png b/incubator-tools/advance_table_line_enhancement/Images/line_enhancement_basic_table_img(pg1_tb0.csv).png new file mode 100644 index 000000000..917f1cb5c Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/line_enhancement_basic_table_img(pg1_tb0.csv).png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/line_enhancement_basic_table_img.png b/incubator-tools/advance_table_line_enhancement/Images/line_enhancement_basic_table_img.png new file mode 100644 index 000000000..3f7ba2a57 Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/line_enhancement_basic_table_img.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/ocr_walk_input_sample.png b/incubator-tools/advance_table_line_enhancement/Images/ocr_walk_input_sample.png new file mode 100644 index 000000000..6af4c43b3 Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/ocr_walk_input_sample.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/ocr_walk_output_sample.png b/incubator-tools/advance_table_line_enhancement/Images/ocr_walk_output_sample.png new file mode 100644 index 000000000..507fcd36e Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/ocr_walk_output_sample.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/page_merger_input_1.png b/incubator-tools/advance_table_line_enhancement/Images/page_merger_input_1.png new file mode 100644 index 000000000..53eeb1417 Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/page_merger_input_1.png differ 
diff --git a/incubator-tools/advance_table_line_enhancement/Images/page_merger_input_2.png b/incubator-tools/advance_table_line_enhancement/Images/page_merger_input_2.png new file mode 100644 index 000000000..4610ca9df Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/page_merger_input_2.png differ diff --git a/incubator-tools/advance_table_line_enhancement/Images/page_merger_output.png b/incubator-tools/advance_table_line_enhancement/Images/page_merger_output.png new file mode 100644 index 000000000..815291a42 Binary files /dev/null and b/incubator-tools/advance_table_line_enhancement/Images/page_merger_output.png differ diff --git a/incubator-tools/advance_table_line_enhancement/README.md b/incubator-tools/advance_table_line_enhancement/README.md new file mode 100644 index 000000000..264193edb --- /dev/null +++ b/incubator-tools/advance_table_line_enhancement/README.md @@ -0,0 +1,142 @@
+# Purpose and Description
+This README contains information about five tools.
+
+* 1. Line Enhancement Basic Flow: This tool is built in Python. It enhances the table lines in the input PDF files, converts the tables they contain to CSV files, and stores the results in a GCS bucket.
+
+## Input Details for this tool: The input for this step is a GCS bucket containing rebuilt PDF files (which contain only tables)
+* **project_id**: GCP project ID
+* **project_num**: GCP project number
+* **location**: Processor location, `us` or `eu`
+* **cde_processor_id**: CDE processor ID used to call the batch process
+* **gcs_input_uri**: GCS folder containing the input PDF files (files with only the specific-use-case tables)
+* **input_mime_type**: MIME type of the input files, which is `application/pdf` here
+* **gcs_output_bucket_uri**: GCS output bucket URI without a trailing slash
+* **gcs_output_uri_prefix**: GCS output folder path to store results
+* **field_mask**: Stores specific keys of the document proto (entities,pages.pageNumber)
+* **timeout**: Time to wait for the batch-process LRO to complete
+* **gcs_cde_hitl_output_prefix**: GCS folder that stores the HITL output results from the CDE processor
+* **line_enhancement_vertical_offset**: Offset used to adjust the placement of the vertical lines; it can be tuned based on the layout
+* **line_enhancement_horizontal_offset**: Offset used to adjust the placement of the horizontal lines; it can be tuned based on the layout (an illustrative sketch of how these offsets are used follows the sample image below)
+* **flow**: For this notebook the flow is `line_enhancement_basic`
+* **fp_processor_id**: FP processor ID used to call the batch process
+* **fp_processor_v1**: FP version ID used to call the batch process
+* **gcs_line_enhance_output_prefix**: GCS prefix to store line-enhancement results
+* **gcs_fpoutput_uri_prefix**: GCS prefix to store FP results
+
+Sample image after training the CDE processor for row headers & column headers
+ + + + +
CDE Samplecde_train_sample.png
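The drawing itself happens inside `enhance_and_save_pdfs` in `tool_helper_functions.py`, which is not included in this change. Purely as an illustration of what the two offsets control (the helper name, coordinates, and offset values below are made up for the example), line enhancement amounts to painting explicit vertical and horizontal rules onto each page image before the page is sent to the Form Parser:

```python
from PIL import Image, ImageDraw


def draw_table_lines(page_img, col_xs, row_ys, v_offset=25, h_offset=5):
    """Paint vertical/horizontal rules on a page image so that table borders
    become explicit. col_xs / row_ys are pixel x/y positions of column and row
    boundaries (e.g. derived from CDE header bounding boxes)."""
    img = page_img.convert("RGB")
    draw = ImageDraw.Draw(img)
    for x in col_xs:
        draw.line([(x - v_offset, 0), (x - v_offset, img.height)], fill="black", width=2)
    for y in row_ys:
        draw.line([(0, y - h_offset), (img.width, y - h_offset)], fill="black", width=2)
    return img


# Hypothetical usage on a blank page; in the notebooks the page images come
# from pdf2image.convert_from_path on the input PDF.
page = Image.new("RGB", (800, 600), "white")
enhanced = draw_table_lines(page, col_xs=[120, 340, 560], row_ys=[200, 260, 320])
enhanced.save("enhanced_page_sample.png")
```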
+
+## Output Details: Table sample from the PDF file
+line_enhancement_basic_table_img.png
+
+* 2. Table Parsing using CDE Headers and Form Parser: This tool is built in Python. It converts the tables in a PDF to CSV files and stores them in a GCS bucket without enhancing the input PDF files, by combining the Form Parser and CDE results (a conceptual sketch of how CDE headers can be matched to Form Parser columns follows the sample image below).
+
+## Input Details for this tool: The input for this step is a GCS bucket containing PDF files (which contain only your specific-use-case tables)
+
+* **project_id**: GCP project ID
+* **project_num**: GCP project number
+* **location**: Processor location, `us` or `eu`
+* **cde_processor_id**: CDE processor ID used to call the batch process
+* **gcs_input_uri**: GCS folder containing the input PDF files (files with only the specific-use-case tables)
+* **input_mime_type**: MIME type of the input files, which is `application/pdf` here
+* **gcs_output_bucket_uri**: GCS output bucket URI without a trailing slash
+* **gcs_cde_output_uri_prefix**: GCS output folder path to store CDE results
+* **gcs_fp_output_uri_prefix**: GCS output folder path to store FP results
+* **gcs_cde_fp_output_uri_prefix**: GCS prefix to store the final OCR-walk output results
+* **field_mask**: Stores specific keys of the document proto (entities,pages.pageNumber)
+* **timeout**: Time to wait for the batch-process LRO to complete
+* **flow**: For this notebook the flow is `ocr_walk`
+* **fp_processor_id**: FP processor ID used to call the batch process
+
+Sample image after training the CDE processor for row headers & column headers
+ + + + +
CDE Samplecde_train_sample.png
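The actual merging of CDE header entities with Form Parser tables is done by `walk_the_ocr` in `tool_helper_functions.py`, which is not shown in this change. The snippet below is therefore only a conceptual sketch under the assumption that both sources expose normalized x-coordinates: a Form Parser column can be labelled with the CDE column-header type whose bounding box overlaps it the most. The entity dictionaries are simplified stand-ins, not the Document AI proto.

```python
def best_header_for_column(col_x_min, col_x_max, header_entities):
    """Return the CDE column-header type whose x-range overlaps the
    Form Parser column [col_x_min, col_x_max] the most (normalized coords)."""

    def overlap(ent):
        return max(0.0, min(col_x_max, ent["x_max"]) - max(col_x_min, ent["x_min"]))

    best = max(header_entities, key=overlap)
    return best["type"] if overlap(best) > 0 else None


# Hypothetical CDE predictions for two column headers
headers = [
    {"type": "code", "x_min": 0.05, "x_max": 0.15},
    {"type": "business_measure", "x_min": 0.16, "x_max": 0.30},
]
print(best_header_for_column(0.17, 0.28, headers))  # -> business_measure
```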
+
+## Output Details: One of the table samples from the PDF file
+ocr_walk_input_sample.png
+
+* 3. FP_tables_to_csv: This tool is built in Python. It converts the tables in a PDF to CSV files using the Form Parser results and stores them in a GCS bucket.
+
+## Input Details: fp_tables_to_csv
+* **project_id**: GCP project ID
+* **location**: Processor location, `us` or `eu`
+* **fp_processor_id**: FP processor ID used to call the batch process
+* **gcs_input_uri**: GCS folder containing the input PDF files (files with only the specific-use-case tables)
+* **input_mime_type**: MIME type of the input files, which is `application/pdf` here
+* **gcs_output_bucket_uri**: GCS output bucket URI without a trailing slash
+* **gcs_output_uri_prefix**: GCS output folder path to store results
+* **field_mask**: Stores specific keys of the document proto (entities,pages.pageNumber)
+* **timeout**: Time to wait for the batch-process LRO to complete
+* **fp_processor_v**: FP version (V1 or V2) ID used to call the batch process
+
+## Output Details: Output sample for one table, stored as CSV files in the GCS bucket
+fp_tables_to_csv_output_csv_sample.png
+
+* 4. Table Extraction with Line Enhancement: This tool is built in Python. It converts the tables in a PDF to CSV files and stores them in a GCS bucket by enhancing the input PDF files and parsing them through both the Form Parser and CDE results.
+
+## Input Details for Table Extraction with Line Enhancement
+* **project_id**: GCP project ID
+* **project_num**: GCP project number
+* **location**: Processor location, `us` or `eu`
+* **cde_processor_id**: CDE processor ID used to call the batch process
+* **gcs_input_uri**: GCS folder containing the input PDF files (files with only the specific-use-case tables)
+* **input_mime_type**: MIME type of the input files, which is `application/pdf` here
+* **gcs_output_bucket_uri**: GCS output bucket URI without a trailing slash
+* **gcs_output_uri_prefix**: GCS output folder path to store results
+* **field_mask**: Stores specific keys of the document proto (entities,pages.pageNumber)
+* **timeout**: Time to wait for the batch-process LRO to complete
+* **gcs_cde_hitl_output_prefix**: GCS folder that stores the HITL output results from the CDE processor
+* **line_enhancement_vertical_offset**: Offset used to adjust the placement of the vertical lines; it can be tuned based on the layout
+* **line_enhancement_horizontal_offset**: Offset used to adjust the placement of the horizontal lines; it can be tuned based on the layout
+* **flow**: For this notebook the flow is `line_enhancement_basic`
+* **fp_processor_id**: FP processor ID used to call the batch process
+* **fp_processor_v2**: FP version 2 ID used to call the batch process
+* **gcs_line_enhance_output_prefix**: GCS prefix to store line-enhancement results
+* **gcs_fpoutput_uri_prefix**: GCS prefix to store FP results
+
+## Output Details
+
+Table sample of the PDF file after running the **enhance_and_save_pdfs** function
+after_line_enhancement_sample.png
+
+Output sample for one table, stored as CSV files in the GCS bucket
+line_enhancement_basic_table_img
+
+* 5. Table Spanning Page Merge Script: DocumentAI Page Merger is a tool built in Python. Its purpose is to provide a technique for merging tables (specific-use-case tables) that span two pages. This document highlights how the tool (script) works and what it requires (a simplified version of the split-page check it performs is shown at the end of this README).
+
+## Input Details for Table Spanning Page Merge Script
+* **PROJECT_ID**: Provide your GCP project ID
+* **LOCATION**: Provide the processor location, `us` or `eu`
+* **PROCESSOR_ID**: Provide the ID of the CDE processor
+* **FOLDER_PATH**: Folder that holds the input PDF files (the PDF pages should contain only the use-case table pages)
+* **OUTPUT_FOLDER**: Set the output folder path where the merged PDFs should be stored on your local system
+* **MIME_TYPE**: Provide the MIME type of the input documents
+* **COL_HEADERS**: Provide a list of all entity types annotated in the CDE processor to identify *column headers*
+* **ROW_HEADERS**: Provide a list of all entity types annotated in the CDE processor to identify *row headers*
+
+## Output Details for Page Merger Script
+## Input file has the table across two pages
+ + + + +
page_merger_input_1.pngpage_merger_input_2.png
+
+## After running the page_merger script, the table appears on a single page
+
+ + + + + +
page_merger_output.png
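For reference, the split-page detection performed by `Table_Spanning_Page_Merge_Script.ipynb` (included in full later in this change) boils down to the following check. The sketch below works on plain per-page sets of predicted entity types rather than the Document AI `Document` proto, and the `COL_HEADERS` list is shortened for brevity:

```python
from typing import Dict, List, Set, Tuple

ROW_HEADERS = ["taxonomy_disclosure", "activity"]
COL_HEADERS = ["code", "business_measure", "SCC", "DNSH"]  # shortened for the example


def find_split_pairs(page_types: Dict[int, Set[str]]) -> List[Tuple[int, int]]:
    """Return (page, next_page) pairs where a table starts on `page`
    (all row headers present) and continues on `next_page`
    (no row headers, but at least one column header)."""
    pages = sorted(page_types)
    pairs = []
    for cur, nxt in zip(pages, pages[1:]):
        cur_types, nxt_types = page_types[cur], page_types[nxt]
        starts_here = all(r in cur_types for r in ROW_HEADERS)
        continues_next = not any(r in nxt_types for r in ROW_HEADERS) and any(
            c in nxt_types for c in COL_HEADERS
        )
        if starts_here and continues_next:
            pairs.append((cur, nxt))
    return pairs


print(find_split_pairs({0: {"activity", "taxonomy_disclosure", "code"}, 1: {"SCC", "DNSH"}}))
# -> [(0, 1)]
```

Each detected pair is then cropped and merged side by side, as implemented in the notebook's `crop_right`, `crop_left`, and `process_pdf` functions.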
diff --git a/incubator-tools/advance_table_line_enhancement/Table_Parsing_using_CDE_Headers_and_Form_parser.ipynb b/incubator-tools/advance_table_line_enhancement/Table_Parsing_using_CDE_Headers_and_Form_parser.ipynb new file mode 100644 index 000000000..29894a0f1 --- /dev/null +++ b/incubator-tools/advance_table_line_enhancement/Table_Parsing_using_CDE_Headers_and_Form_parser.ipynb @@ -0,0 +1,401 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "79c0a10c-1269-4a48-b4c5-ef4bbbdd2644", + "metadata": {}, + "source": [ + "# Table Parsing using Custom CDE Headers and Form parser" + ] + }, + { + "cell_type": "markdown", + "id": "0f01fe57-ffc0-402b-81ac-5f77ebd77ece", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "7fa07f2d-5087-4d8d-9ceb-5d46a21e8926", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "a2c6bbc7-49af-4a2d-98b8-09b9b8739e57", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "The purpose of this notebook is to convert tables found in PDF documents into CSV files, which are then stored in a GCS bucket. It utilizes the headers from the tables extracted by the CDE parser, along with the table output from the Form Parser, to generate the desired table result, which is subsequently saved in CSV format. Notably, this workflow does not involve any table enhancement to the input PDF files." + ] + }, + { + "cell_type": "markdown", + "id": "107ff6ab-a7a9-4e31-b00f-9e7706fca09f", + "metadata": {}, + "source": [ + "# Pre-requisites" + ] + }, + { + "cell_type": "markdown", + "id": "05b0a705-8144-40d4-a92a-15488bd4d1bf", + "metadata": {}, + "source": [ + "This tool requires the following services:\n", + "\n", + " * Vertex AI Notebook instance\n", + " * Access to Document AI CDE & Form Parser Processor\n", + " * GCS Bucket for storage purpose\n", + " \n", + "Google Jupyter Notebook is used for running the python notebook file. Cloud Storage Buckets is needed to store and pass input files to this script & to store results." + ] + }, + { + "cell_type": "markdown", + "id": "114a1ed5-30fc-4b57-9f4a-3c84a73b3591", + "metadata": {}, + "source": [ + "CDE for Headers, Create a Custom Document Extractor(CDE) Processor & Configure HITL to review poor performing documents. Train your CDE as per your use-case table by annotating **row headers** & **column headers** for specific use-case-table\n", + "* Input for this step is GCS bucket containing PDF files(which has only your specific-use-case tables), now run `batch_process_documents`\n", + "* Output JSON files will be store GCS bucket \n", + "\n", + "Sample image after training CDE processor for row columns & header columns\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
CDE Sample
\n", + "Here are sample row headers and column headers which we followed while training CDE for our specific use-case table \n", + "\n", + "**column headers** are as follow a[\"SCC\", \"DNSH\", \"DNSH_P\", \"code\", \"business_measure\", \"DNSH_BE\", \"DNSH_CCA\", \"DNSH_CCM\", \"DNSH_CE\", \"DNSH_WMR\", \"min_safeguards\", \"proportion_of_bm\", \"SCC_BE\", \"SCC_CCA\", \"SCC_CCM\", \"SCC_CE\", \"SCC_P\", \"SCC_WMR\"] and **row headers** are as follow [\"taxonomy_disclosure\", \"activity\"]" + ] + }, + { + "cell_type": "markdown", + "id": "53acbcf2-499b-481d-a64d-f2f334554577", + "metadata": {}, + "source": [ + "# Script" + ] + }, + { + "cell_type": "markdown", + "id": "2ad3f0db-8f32-4fec-b480-a17cd2b39967", + "metadata": {}, + "source": [ + "# 1. Import Modules/Packages" + ] + }, + { + "cell_type": "markdown", + "id": "3c8cf73b-4a79-4c6e-b214-1aef4c16240a", + "metadata": {}, + "source": [ + "**Note** : Please download the **tool_helper_functions.py** Python file before proceeding to further steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a789a344-ca2e-49a6-b8ce-e7c9b1a0d673", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "from tool_helper_functions import (\n", + " batch_process_documents,\n", + " get_processor_metadata,\n", + " poll_hitl_operations,\n", + " walk_the_ocr,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f60bfb4d-f9e2-48de-894a-f0c736ba45e6", + "metadata": {}, + "source": [ + "# 2. Input Details : Configure below Input variables" + ] + }, + { + "cell_type": "markdown", + "id": "6b531f5c-b126-4074-8b4c-3b425f9b1513", + "metadata": {}, + "source": [ + "* **project_id**: GCP project ID\n", + "* **project_num**: GCP project Number\n", + "* **location**: Processor location `us` or `eu`\n", + "* **cde_processor_id**: CDE processor ID to call batch process\n", + "* **gcs_input_uri**: GCS folder which contains input pdf files(files with only specific-use-case tables)\n", + "* **input_mime_type**: Mime type of input files which is `application/pdf` here\n", + "* **gcs_output_bucket_uri**: GCS output bucket uri without trailing slash\n", + "* **gcs_cde_output_uri_prefix**: GCS output folder path to store CDE results\n", + "* **gcs_fp_output_uri_prefix**: GCS output folder path to store FP results\n", + "* **gcs_cde_fp_output_uri_prefix**: GCS prefix to store ocr walk final output results\n", + "* **field_mask**: To store specific keys of document proto (entities,pages.pageNumber)\n", + "* **timeout**: to wait for batch process LRO operation to complete\n", + "* **flow**: for this notebook file flow is `ocr_walk`\n", + "* **fp_processor_id**: FP Processor ID to call batch process" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0d6c085-a232-480e-ba7d-7ac00d47e185", + "metadata": {}, + "outputs": [], + "source": [ + "datetime_suffix = \"{date:%Y-%m-%d_%H:%M:%S}\".format(date=datetime.datetime.now())\n", + "project_id = \"\"\n", + "project_num = \"\"\n", + "location = \"\" # us or eu\n", + "cde_processor_id = \"\"\n", + "gcs_input_uri = f\"gs://bucket_name/prefix/to_input/{datetime_suffix}\"\n", + "input_mime_type = \"\" # \"application/pdf\"\n", + "gcs_output_bucket_uri = \"gs://bucket_name\"\n", + "gcs_cde_output_uri_prefix = f\"cde_output/prefix/{datetime_suffix}\"\n", + "gcs_fp_output_uri_prefix = f\"fp_output/prefix/{datetime_suffix}\"\n", + "gcs_cde_fp_output_uri_prefix = f\"cde_fp_output/prefix/{datetime_suffix}\"\n", + "field_mask = None\n", + "timeout = 5000\n", + "flow = 
\"ocr_walk\"\n", + "fp_processor_id = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "bc17cd73-dc06-44c9-9e1c-95b91224b077", + "metadata": {}, + "source": [ + "# 3. Run below code" + ] + }, + { + "cell_type": "markdown", + "id": "070b4fdc-86ca-4fb9-acc7-8882d5e55f02", + "metadata": {}, + "source": [ + "Now call `batch_process_documents` function to process all files in input folder(each file contains specific-use-case table only), it results metadata & operation_id of batch process(Long Running Operation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a08d85f9-e1eb-4897-a1e8-e8e0abcacdd2", + "metadata": {}, + "outputs": [], + "source": [ + "cde_metadata, cde_operation = batch_process_documents(\n", + " project_id,\n", + " location,\n", + " cde_processor_id,\n", + " gcs_input_uri,\n", + " input_mime_type,\n", + " gcs_output_bucket_uri,\n", + " gcs_cde_output_uri_prefix,\n", + " field_mask,\n", + " timeout,\n", + ")\n", + "print(\"CDE batch process completed\")" + ] + }, + { + "cell_type": "markdown", + "id": "46c39adc-e0bf-4329-a77c-b88eb083f4ce", + "metadata": {}, + "source": [ + "Now use `get_processor_metadata` function from utils module, it takes batch process metsdata as input and results key-value pairs of filenames & it's prefix and hitl operation-id(if input files triggers hitl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2a8006e-2ab7-4644-9529-ef141c7d16ba", + "metadata": {}, + "outputs": [], + "source": [ + "cde_input_output_map = get_processor_metadata(cde_metadata)\n", + "# cde_input_output_map variable data as below\n", + "# {'03_Non-Financial_Corporate_Report_2022_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/0', 'hitl': '12795457638097959002'}, '1962771_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/1', 'hitl': '11860520012484438543'}, '2022_VGT_Group Annual Report_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/5', 'hitl': '2523802694474965110'}, 'DE0007030009-JA-2022-EQ-E-00-pg144_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/3', 'hitl': '14342450698739476592'}, 'DE0007030009-JA-2022-EQ-E-00_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/4', 'hitl': '17242897657994716395'}, 'DE000STRA555-JA-2022-EQ-E-00_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/2', 'hitl': '2909143051612169782'}}" + ] + }, + { + "cell_type": "markdown", + "id": "4b515162-d9a8-4674-8935-4d04165f808c", + "metadata": {}, + "source": [ + "`poll_hitl_operations` is a waiting function to check & resolve HITL triggered documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78897271-f857-41c9-9b2d-217b85b29d33", + "metadata": {}, + "outputs": [], + "source": [ + "poll_hitl_operations(project_num, location, cde_input_output_map)" + ] + }, + { + "cell_type": "markdown", + "id": "f600796f-b260-4850-ae73-ed784d382551", + "metadata": {}, + "source": [ + "Now call `batch_process_documents` function to process all files in input folder(each file contains specific-use-case table only), it results metadata & operation_id of batch process(Long Running Operation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f379570-ce6a-4b8e-b791-cfceb7f2d483", + "metadata": {}, + "outputs": [], + "source": [ + "fp_metadata, fp_operation = 
batch_process_documents(\n", + " project_id,\n", + " location,\n", + " fp_processor_id,\n", + " gcs_input_uri,\n", + " input_mime_type,\n", + " gcs_output_bucket_uri,\n", + " gcs_fp_output_uri_prefix,\n", + " field_mask,\n", + " timeout,\n", + " fp_processor_v2,\n", + ")\n", + "print(\"FP batch process completed\")" + ] + }, + { + "cell_type": "markdown", + "id": "256b915c-6610-4592-96d9-e643c244523f", + "metadata": {}, + "source": [ + "Now use `get_processor_metadata` function from utils module, it takes batch process metsdata as input and results key-value pairs of filenames & it's prefix and hitl operation-id(if input files triggers hitl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d237c658-7021-4613-8f24-c3570c60c949", + "metadata": {}, + "outputs": [], + "source": [ + "fp_input_output_map = get_processor_metadata(fp_metadata, fp=True)\n", + "# fp_input_output_map sample as below\n", + "# {'03_Non-Financial_Corporate_Report_2022_extracted.pdf': 'msci/TESTING/fp_output/2023-11-02_18:25:31/10273358736471385291/0', '1962771_extracted.pdf': 'msci/TESTING/fp_output/2023-11-02_18:25:31/10273358736471385291/1','2022_VGT_Group Annual Report_extracted.pdf': 'msci/TESTING/fp_output/2023-11-02_18:25:31/10273358736471385291/4','DE0007030009-JA-2022-EQ-E-00-pg144_extracted.pdf': 'msci/TESTING/fp_output/2023-11-02_18:25:31/10273358736471385291/5','DE0007030009-JA-2022-EQ-E-00_extracted.pdf': 'msci/TESTING/fp_output/2023-11-02_18:25:31/10273358736471385291/2','DE000STRA555-JA-2022-EQ-E-00_extracted.pdf': 'msci/TESTING/fp_output/2023-11-02_18:25:31/10273358736471385291/3'}" + ] + }, + { + "cell_type": "markdown", + "id": "08ba28f7-c87c-447e-8bd6-ae91c303ed65", + "metadata": {}, + "source": [ + "`poll_hitl_operations` is a waiting function to check & resolve HITL triggered documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80982dee-0f66-43a2-a0ad-e5c2222771a2", + "metadata": {}, + "outputs": [], + "source": [ + "poll_hitl_operations(project_num, location, cde_input_output_map)" + ] + }, + { + "cell_type": "markdown", + "id": "e49867d1-4451-43e8-b0c7-1665821d9eb6", + "metadata": {}, + "source": [ + "`walk_the_ocr` function uses CDE and FP json output and parse it to get final output for both row headers & column headers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6810abf-b33f-4501-8f8a-d81aaca01d63", + "metadata": {}, + "outputs": [], + "source": [ + "walk_the_ocr(\n", + " project_id,\n", + " location,\n", + " cde_input_output_map,\n", + " gcs_output_bucket,\n", + " gcs_cde_hitl_output_prefix,\n", + " fp_input_output_map,\n", + " f\"{gcs_output_uri_prefix}/{flow}/{datetime_suffix}\",\n", + " offset,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "589ae870-f711-4736-ab1e-0b08fa189e74", + "metadata": { + "tags": [] + }, + "source": [ + "# Output Samples" + ] + }, + { + "cell_type": "markdown", + "id": "596f22c7-e68a-4f34-8ce9-e7b4822d31f2", + "metadata": {}, + "source": [ + "One of the table Sample from pdf file\n", + "![](./Images/ocr_walk_input_sample.png)" + ] + }, + { + "cell_type": "markdown", + "id": "9ac67de3-7306-4e41-b046-7021ae086d79", + "metadata": {}, + "source": [ + "output sample for one-table which stored as csv files in GCS bucket\n", + "![](./Images/ocr_walk_output_sample.png)" + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + 
"display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/advance_table_line_enhancement/Table_Spanning_Page_Merge_Script.ipynb b/incubator-tools/advance_table_line_enhancement/Table_Spanning_Page_Merge_Script.ipynb new file mode 100644 index 000000000..03fd53736 --- /dev/null +++ b/incubator-tools/advance_table_line_enhancement/Table_Spanning_Page_Merge_Script.ipynb @@ -0,0 +1,631 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cc34f8f6-9b81-48d5-b317-7da269c078d6", + "metadata": { + "id": "cc34f8f6-9b81-48d5-b317-7da269c078d6" + }, + "source": [ + "# DocumentAI Merge Specific-Use-Case Table(Table Across Two Pages) Script" + ] + }, + { + "cell_type": "markdown", + "id": "b85f7bae-ae78-4b22-bb23-98badf204211", + "metadata": { + "id": "b85f7bae-ae78-4b22-bb23-98badf204211" + }, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "29a7ab42-e144-40ce-a258-d65455092b8d", + "metadata": { + "id": "29a7ab42-e144-40ce-a258-d65455092b8d" + }, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "04591d9d-9ad3-483f-ab45-3d4e0b7570e9", + "metadata": { + "id": "04591d9d-9ad3-483f-ab45-3d4e0b7570e9", + "tags": [] + }, + "source": [ + "## Objective\n", + "\n", + "DocumentAI Page Merger is a tool built using Python programming language. Its purpose is to provide technique for merging table(**Specific use case tables**) which spans across two pages. This document highlights the working of the tool(script) and its requirements." + ] + }, + { + "cell_type": "markdown", + "id": "290ab039-258e-4761-b277-6d81db637ae0", + "metadata": { + "id": "290ab039-258e-4761-b277-6d81db637ae0" + }, + "source": [ + "**NOTE**:\n", + "* Input pdf files contains only use-case table which spans across two pages.\n", + "* You need to train CDE processor for *specific use-case table* by annotating `row_header` and `column_header` entities. These headers are needed to run this page merger script." + ] + }, + { + "cell_type": "markdown", + "id": "3e185f3a-333b-4dfa-bc50-5febbd27444b", + "metadata": { + "id": "3e185f3a-333b-4dfa-bc50-5febbd27444b" + }, + "source": [ + "This tool requires the following services:\n", + "\n", + " * Vertex AI Notebook instance\n", + " * Access to Document AI CDE Processor\n", + " * Folder containing input PDFs\n", + "\n", + "Google Jupyter Notebook is used for running the python notebook file. Input folder should have the input files to this script. CDE processor to train a model which detects row headers and column headers for your specific usecase table by annotating row headers and column headers." 
+ ] + }, + { + "cell_type": "markdown", + "id": "yEPx3vNjUn18", + "metadata": { + "id": "yEPx3vNjUn18" + }, + "source": [ + "## Approach \n", + "* Using the CDE processor output identifies pairs of consecutive pages where a table starts on the first page (with row headers) and continues on the next page (without row headers but with column headers), indicating a split table across those pages.\n", + "* Using Pillow, identified excess regions with no white pixels on the PDFs, and cropped the white space on the right of the first page and the left side of the second page for each identified pair.\n", + "* Horizontally merged the cropped pages to create seamless and complete tables." + ] + }, + { + "cell_type": "markdown", + "id": "56a7f1ce-3cf2-472a-a42a-aae0c1d3c387", + "metadata": { + "id": "56a7f1ce-3cf2-472a-a42a-aae0c1d3c387" + }, + "source": [ + "CDE for Headers, Create a Custom Document Extractor(CDE) Processor & Configure HITL to review poor performing documents. Train your CDE as per your use-case table by annotating **row headers** & **column headers** for specific use-case-table\n", + "* Input for this step is GCS bucket containing re-builted PDF files (which are output from step-2a advance_table_parser.ipynb), now run `batch_process_documents`\n", + "* Output JSON files will be store GCS bucket" + ] + }, + { + "cell_type": "markdown", + "id": "478d7676-5d9d-4096-9d11-6d4b64fc47cd", + "metadata": { + "id": "478d7676-5d9d-4096-9d11-6d4b64fc47cd" + }, + "source": [ + "Sample image after training CDE processor for row columns & header columns\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
CDE Sample
\n", + "Here are sample row headers and column headers which we followed while training CDE for our specific use-case table \n", + "\n", + "**column headers** are as follow a[\"SCC\", \"DNSH\", \"DNSH_P\", \"code\", \"business_measure\", \"DNSH_BE\", \"DNSH_CCA\", \"DNSH_CCM\", \"DNSH_CE\", \"DNSH_WMR\", \"min_safeguards\", \"proportion_of_bm\", \"SCC_BE\", \"SCC_CCA\", \"SCC_CCM\", \"SCC_CE\", \"SCC_P\", \"SCC_WMR\"] and **row headers** are as follow [\"taxonomy_disclosure\", \"activity\"]" + ] + }, + { + "cell_type": "markdown", + "id": "7fdfb175-3c18-48ed-afbb-3a81b889d936", + "metadata": { + "id": "7fdfb175-3c18-48ed-afbb-3a81b889d936" + }, + "source": [ + "# Script" + ] + }, + { + "cell_type": "markdown", + "id": "b9d2888a-633d-48dc-be50-156df01f7eba", + "metadata": { + "id": "b9d2888a-633d-48dc-be50-156df01f7eba" + }, + "source": [ + "## 1. Import Modules/Packages" + ] + }, + { + "cell_type": "markdown", + "id": "5993b837-8055-485d-8c73-86084d1b3480", + "metadata": {}, + "source": [ + "**Note** : Please download the **tool_helper_functions.py** Python file before proceeding to further steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2827a0a-8028-4fa3-b532-793fd97babc8", + "metadata": { + "id": "a2827a0a-8028-4fa3-b532-793fd97babc8" + }, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "from typing import List, Tuple, Union\n", + "\n", + "import img2pdf\n", + "import pandas as pd\n", + "from google.cloud import documentai_v1 as documentai\n", + "from pdf2image import convert_from_path\n", + "from PIL import Image, PpmImagePlugin\n", + "from PyPDF2 import PdfMerger" + ] + }, + { + "cell_type": "markdown", + "id": "d339050c-eed6-497e-9eb4-4b9bdddcea9d", + "metadata": { + "id": "d339050c-eed6-497e-9eb4-4b9bdddcea9d" + }, + "source": [ + "## 2. 
Input Details : Configure below Input variables" + ] + }, + { + "cell_type": "markdown", + "id": "ea8b8e24-398d-4e9b-8a36-33cde0118a6e", + "metadata": { + "id": "ea8b8e24-398d-4e9b-8a36-33cde0118a6e" + }, + "source": [ + "* **PROJECT_ID** : Provide your GCP Project ID\n", + "* **LOCATION** : Provide the location of processor like `us` or `eu`\n", + "* **PROCESSOR_ID** : Provide ID of CDE processor\n", + "* **FOLDER_PATH** : Folder which hold input pdf files(pdf pages should be having only use-case table pages)\n", + "* **OUTPUT_FOLDER** : Set your output folder path where the merged pdfs should be stored in your local system\n", + "* **MIME_TYPE** : Provide mime type of input documents\n", + "* **COL_HEADERS** : Provide list of all entities(entity type) which are annotated in CDE processor to identify *column headers*\n", + "* **ROW_HEADERS** : Provide list of all entities(entity type) which are annotated in CDE processor to identify *row headers*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1164e220-9b94-4a22-bb91-d92349684bc8", + "metadata": { + "id": "1164e220-9b94-4a22-bb91-d92349684bc8" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"\"\n", + "LOCATION = \"\"\n", + "PROCESSOR_ID = \"\"\n", + "FOLDER_PATH = \"dir_path/to/input_folder/\"\n", + "OUTPUT_FOLDER = \"output_dir/path/\"\n", + "MIME_TYPE = \"application/pdf\"\n", + "# replace COL_HEADERS & ROW_HEADERS with your list of annotation_types\n", + "COL_HEADERS = [\n", + " \"SCC\",\n", + " \"DNSH\",\n", + " \"DNSH_P\",\n", + " \"code\",\n", + " \"business_measure\",\n", + " \"DNSH_BE\",\n", + " \"DNSH_CCA\",\n", + " \"DNSH_CCM\",\n", + " \"DNSH_CE\",\n", + " \"DNSH_WMR\",\n", + " \"min_safeguards\",\n", + " \"proportion_of_bm\",\n", + " \"SCC_BE\",\n", + " \"SCC_CCA\",\n", + " \"SCC_CCM\",\n", + " \"SCC_CE\",\n", + " \"SCC_P\",\n", + " \"SCC_WMR\",\n", + "]\n", + "ROW_HEADERS = [\"taxonomy_disclosure\", \"activity\"]" + ] + }, + { + "cell_type": "markdown", + "id": "d557a339-3bce-43a4-888f-deb8281a84f0", + "metadata": { + "id": "d557a339-3bce-43a4-888f-deb8281a84f0" + }, + "source": [ + "Below image shows, after annotating row headers & column headers for CDE\n", + "![](./Images/cde_train_sample.png)" + ] + }, + { + "cell_type": "markdown", + "id": "d239dcb8-f71f-4bbe-af9a-efa748dd6811", + "metadata": { + "id": "d239dcb8-f71f-4bbe-af9a-efa748dd6811" + }, + "source": [ + "## 3. 
Run the below code.\n", + "\n", + "Use the below code and Run all the cells (Update the Path parameter if it is not available in the current working directory)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e18f7309-60ee-4ecc-9fc7-7ebb9cf9f47b", + "metadata": { + "id": "e18f7309-60ee-4ecc-9fc7-7ebb9cf9f47b" + }, + "outputs": [], + "source": [ + "def online_process(\n", + " project_id: str,\n", + " location: str,\n", + " processor_id: str,\n", + " file_path: str,\n", + " mime_type: str,\n", + ") -> documentai.Document:\n", + " \"\"\"\n", + " Processes a document using the Document AI Online Processing API.\n", + " \"\"\"\n", + "\n", + " opts = {\"api_endpoint\": f\"{location}-documentai.googleapis.com\"}\n", + "\n", + " # Instantiates a client\n", + " documentai_client = documentai.DocumentProcessorServiceClient(client_options=opts)\n", + "\n", + " # The full resource name of the processor, e.g.:\n", + " # projects/project-id/locations/location/processor/processor-id\n", + " # You must create new processors in the Cloud Console first\n", + " resource_name = documentai_client.processor_path(project_id, location, processor_id)\n", + "\n", + " # Read the file into memory\n", + " with open(file_path, \"rb\") as file:\n", + " file_content = file.read()\n", + "\n", + " # Load Binary Data into Document AI RawDocument Object\n", + " raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)\n", + "\n", + " # Configure the process request\n", + " request = documentai.ProcessRequest(name=resource_name, raw_document=raw_document)\n", + " print(f\"\\tOnline Document Process started..\")\n", + " # Use the Document AI client to process the sample form\n", + " result = documentai_client.process_document(request=request)\n", + " print(\"\\tSuccessfully document process completed\")\n", + " return result.document" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "706ca789-de08-4424-a51b-056cae57bb1d", + "metadata": { + "id": "706ca789-de08-4424-a51b-056cae57bb1d" + }, + "outputs": [], + "source": [ + "def crop_right(\n", + " img: Union[Image.Image, PpmImagePlugin.PpmImageFile]\n", + ") -> Union[Image.Image, PpmImagePlugin.PpmImageFile]:\n", + " \"\"\"\n", + " Function to crop right side of the image\n", + " \"\"\"\n", + "\n", + " img_data = img.getdata()\n", + " non_empty_columns = [\n", + " i\n", + " for i in range(img.width)\n", + " if not all(\n", + " img_data[i + j * img.width][:3] == (255, 255, 255)\n", + " for j in range(img.height)\n", + " )\n", + " ]\n", + " left = 0\n", + " right = max(non_empty_columns)\n", + " img_cropped = img.crop((left, 0, right, img.height))\n", + " return img_cropped\n", + "\n", + "\n", + "def crop_left(\n", + " img: Union[Image.Image, PpmImagePlugin.PpmImageFile]\n", + ") -> Union[Image.Image, PpmImagePlugin.PpmImageFile]:\n", + " \"\"\"\n", + " Function to crop left side of the image\n", + " \"\"\"\n", + "\n", + " img_data = img.getdata()\n", + " non_empty_columns = [\n", + " i\n", + " for i in range(img.width)\n", + " if not all(\n", + " img_data[i + j * img.width][:3] == (255, 255, 255)\n", + " for j in range(img.height)\n", + " )\n", + " ]\n", + " left = min(non_empty_columns)\n", + " right = img.width\n", + " img_cropped = img.crop((left, 0, right, img.height))\n", + " return img_cropped\n", + "\n", + "\n", + "def process_pdf(\n", + " pdf_path: str, page_pairs: List[Tuple[int, int]], OUTPUT_FOLDER: str\n", + ") -> None:\n", + " \"\"\"\n", + " This function processes pages of complex-table which 
spans across two pages\n", + " \"\"\"\n", + "\n", + " pdf_merger = PdfMerger()\n", + " for pair in page_pairs:\n", + " images = convert_from_path(\n", + " pdf_path, first_page=pair[0] + 1, last_page=pair[1] + 1\n", + " ) # incrementing page numbers\n", + " if len(images) != 2:\n", + " print(\"More than 2 pages, skipping..\")\n", + " continue\n", + " img1, img2 = images\n", + " img1_cropped = crop_right(img1)\n", + " img2_cropped = crop_left(img2)\n", + "\n", + " # Merge horizontally\n", + " total_width = img1_cropped.width + img2_cropped.width\n", + " max_height = max(img1_cropped.height, img2_cropped.height)\n", + "\n", + " new_img = Image.new(\"RGB\", (total_width, max_height), (255, 255, 255))\n", + " new_img.paste(img1_cropped, (0, 0))\n", + " new_img.paste(img2_cropped, (img1_cropped.width, 0))\n", + "\n", + " # Save as temporary image file\n", + " temp_img_path = \"temp_merged.png\"\n", + " new_img.save(temp_img_path, \"PNG\")\n", + "\n", + " # Convert to PDF\n", + " with open(temp_img_path, \"rb\") as f:\n", + " pdf_bytes = img2pdf.convert(f.read())\n", + "\n", + " temp_pdf_path = \"temp_merged.pdf\"\n", + " with open(temp_pdf_path, \"wb\") as f:\n", + " f.write(pdf_bytes)\n", + "\n", + " pdf_merger.append(temp_pdf_path)\n", + "\n", + " # Remove temporary files\n", + " os.remove(temp_img_path)\n", + " os.remove(temp_pdf_path)\n", + "\n", + " # Naming the output based on the input file\n", + " output_name = os.path.join(\n", + " OUTPUT_FOLDER, os.path.basename(pdf_path).replace(\".pdf\", \"_merged.pdf\")\n", + " )\n", + " pdf_merger.write(output_name)\n", + " pdf_merger.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bf1c71a-89a3-47e2-906f-2d409c08318a", + "metadata": { + "id": "8bf1c71a-89a3-47e2-906f-2d409c08318a" + }, + "outputs": [], + "source": [ + "def get_entities(document: documentai.Document) -> Tuple[List[str], List[int]]:\n", + " \"\"\"\n", + " It will be used to return all entities data and its corresponding page-number of Document object\n", + " \"\"\"\n", + "\n", + " types = []\n", + " page_no = []\n", + " for entity in document.entities:\n", + " types.append(entity.type_)\n", + " page_no.append(entity.page_anchor.page_refs[0].page)\n", + " for prop in entity.properties:\n", + " types.append(prop.type_)\n", + " page_no.append(entity.page_anchor.page_refs[0].page)\n", + " return types, page_no\n", + "\n", + "\n", + "def page_merger() -> None:\n", + " \"\"\"\n", + " Entry function to start page merging process\n", + " \"\"\"\n", + "\n", + " print(\"Page Merger Pileline started\")\n", + " pdfs_and_pages = {}\n", + " for filename in os.listdir(FOLDER_PATH):\n", + " if not filename.endswith(\".pdf\"):\n", + " continue\n", + "\n", + " file_path = os.path.join(FOLDER_PATH, filename)\n", + " print(\n", + " \"Processing \",\n", + " filename,\n", + " )\n", + " # processing for each PDF file\n", + " document = online_process(\n", + " PROJECT_ID, LOCATION, PROCESSOR_ID, file_path, MIME_TYPE\n", + " )\n", + " types, page_no = get_entities(document)\n", + " df = pd.DataFrame(\n", + " {\n", + " \"Type\": types,\n", + " \"Page No.\": page_no,\n", + " }\n", + " )\n", + " df_sorted = df.sort_values(by=\"Page No.\")\n", + " unique_pages = df_sorted[\"Page No.\"].unique()\n", + " col_headers = COL_HEADERS\n", + " row_headers = ROW_HEADERS\n", + " page_to_headers = {}\n", + " for page in unique_pages:\n", + " page_to_headers[page] = set(\n", + " df_sorted[df_sorted[\"Page No.\"] == page][\"Type\"].tolist()\n", + " )\n", + "\n", + " split_pages = []\n", + " 
for i in range(len(unique_pages) - 1):\n", + " current_page, next_page = unique_pages[i], unique_pages[i + 1]\n", + " current_headers, next_headers = (\n", + " page_to_headers[current_page],\n", + " page_to_headers[next_page],\n", + " )\n", + " if all(row in current_headers for row in row_headers) and not any(\n", + " row in next_headers for row in row_headers\n", + " ):\n", + " if any(header in next_headers for header in col_headers):\n", + " split_pages.append((current_page, next_page))\n", + " if len(split_pages) > 0:\n", + " pdfs_and_pages[filename] = split_pages\n", + " print(f\"\\t\\tDetected split pages in {filename}, pages are -{split_pages}\")\n", + "\n", + " for pdf_file, pages in pdfs_and_pages.items():\n", + " print(\"Processed and saved: \", pdf_file)\n", + " full_pdf_path = os.path.join(FOLDER_PATH, pdf_file)\n", + " pathlib.Path(OUTPUT_FOLDER).mkdir(exist_ok=True)\n", + " if pages:\n", + " print(f\"\\tPage splits are - {pages}\")\n", + " process_pdf(full_pdf_path, pages, OUTPUT_FOLDER)\n", + "\n", + " print(\"Page Merger Pipeline successfully completed for all files\")" + ] + }, + { + "cell_type": "markdown", + "id": "f7f2935a-8f83-4129-b899-a508a8de6173", + "metadata": { + "id": "f7f2935a-8f83-4129-b899-a508a8de6173" + }, + "source": [ + "To start Page Merger pipeline execute `page_merger()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ee2d784-2af3-4189-87ee-0796f50d6396", + "metadata": { + "id": "8ee2d784-2af3-4189-87ee-0796f50d6396", + "outputId": "5b554b8c-d4e3-48dc-a45a-953863db55ff" + }, + "outputs": [], + "source": [ + "page_merger()" + ] + }, + { + "cell_type": "markdown", + "id": "344adb11-1299-458b-bd3d-019feff26bfb", + "metadata": { + "id": "344adb11-1299-458b-bd3d-019feff26bfb" + }, + "source": [ + "# 4. Output" + ] + }, + { + "cell_type": "markdown", + "id": "5ea28429-61c6-4f87-9088-2fb0620c982b", + "metadata": { + "id": "5ea28429-61c6-4f87-9088-2fb0620c982b" + }, + "source": [ + "If table span across two pages then it will be processed and appends them as one page side-by-side, if not spans across two pages then that file is skipped." + ] + }, + { + "cell_type": "markdown", + "id": "b27f5f2f-15d2-43ff-a7d8-e3dbc7c8f94d", + "metadata": { + "id": "b27f5f2f-15d2-43ff-a7d8-e3dbc7c8f94d" + }, + "source": [ + "You can find processed files in the given OUTPUT_FOLDER." + ] + }, + { + "cell_type": "markdown", + "id": "b119f5cd-7db1-4d1f-89cc-2904174ebfb2", + "metadata": { + "id": "b119f5cd-7db1-4d1f-89cc-2904174ebfb2" + }, + "source": [ + "### Input file have table across two pages\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "### After running page_merger script you can find table in single page\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "IzD1CrQvUvoe", + "metadata": { + "id": "IzD1CrQvUvoe" + }, + "source": [ + "## Limitations\n", + "* *CDE Prediction Impact*: The accuracy of split table detection relies on the precise CDE predictions for row and column headers. Inaccuracies in predictions could lead to false positives or missed splits, affecting the merging process.\n", + "* *Row Headers Across Both Pages*: If both pages contain headers, the CDE might not correctly differentiate between them.\n", + "* *Table Spanning Multiple Pages*: If a single table spans more than two pages, then the CDE might not detect.\n", + "* *Inconsistent Split Position*: If the split between the first and second page varies in terms of rows or columns alignment.\n", + "* *Single Page Split*: The approach might miss cases where a table is split within a single" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7525df2d-a88f-4764-b06d-d3008bb7eb85", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "environment": { + "kernel": "conda-root-py", + "name": "workbench-notebooks.m113", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/advance_table_line_enhancement/Table_extraction_with_Line_Enhancement.ipynb b/incubator-tools/advance_table_line_enhancement/Table_extraction_with_Line_Enhancement.ipynb new file mode 100644 index 000000000..d9c16fd75 --- /dev/null +++ b/incubator-tools/advance_table_line_enhancement/Table_extraction_with_Line_Enhancement.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "79c0a10c-1269-4a48-b4c5-ef4bbbdd2644", + "metadata": {}, + "source": [ + "# Line enhancement and Table Parsing using CDE and FP" + ] + }, + { + "cell_type": "markdown", + "id": "0f01fe57-ffc0-402b-81ac-5f77ebd77ece", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "7fa07f2d-5087-4d8d-9ceb-5d46a21e8926", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "a2c6bbc7-49af-4a2d-98b8-09b9b8739e57", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This tool built using Python programming language. It converts tables present in pdf to csv files and stores them in GCS bucket by enhancing input pdf files & parsing them through both Form Parser & CDE results." 
+ ] + }, + { + "cell_type": "markdown", + "id": "107ff6ab-a7a9-4e31-b00f-9e7706fca09f", + "metadata": {}, + "source": [ + "# Pre-requisites" + ] + }, + { + "cell_type": "markdown", + "id": "05b0a705-8144-40d4-a92a-15488bd4d1bf", + "metadata": {}, + "source": [ + "This tool requires the following services:\n", + "\n", + " * Vertex AI Notebook instance\n", + " * Access to Document AI Form Parser & CDE Processor\n", + " * GCS Bucket containing input PDFs & to store output results\n", + " \n", + "Google Jupyter Notebook is used for running the python notebook file. Cloud Storage Buckets is needed to store and pass input files to this script & to store results." + ] + }, + { + "cell_type": "markdown", + "id": "efd3c184-5bce-4ebe-bcdd-5eddf5810c88", + "metadata": {}, + "source": [ + "CDE for Headers, Create a Custom Document Extractor(CDE) Processor & Configure HITL to review poor performing documents. Train your CDE as per your use-case table by annotating **row headers** & **column headers** for specific use-case-table\n", + "* Input for this step is GCS bucket containing PDF files(which has only your specific-use-case tables), now run `batch_process_documents`\n", + "* Output JSON files will be store GCS bucket \n", + "\n", + "Sample image after training CDE processor for row columns & header columns\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
CDE Sample
\n", + "Here are sample row headers and column headers which we followed while training CDE for our specific use-case table \n", + "\n", + "**column headers** are as follow a [\"SCC\", \"DNSH\", \"DNSH_P\", \"code\", \"business_measure\", \"DNSH_BE\", \"DNSH_CCA\", \"DNSH_CCM\", \"DNSH_CE\", \"DNSH_WMR\", \"min_safeguards\", \"proportion_of_bm\", \"SCC_BE\", \"SCC_CCA\", \"SCC_CCM\", \"SCC_CE\", \"SCC_P\", \"SCC_WMR\"] and **row headers** are as follow [\"taxonomy_disclosure\", \"activity\"]" + ] + }, + { + "cell_type": "markdown", + "id": "53acbcf2-499b-481d-a64d-f2f334554577", + "metadata": {}, + "source": [ + "# Script" + ] + }, + { + "cell_type": "markdown", + "id": "2ad3f0db-8f32-4fec-b480-a17cd2b39967", + "metadata": {}, + "source": [ + "# 1. Import Modules/Packages" + ] + }, + { + "cell_type": "markdown", + "id": "7fe0b7ca-ee26-4949-bd3e-8a3b412fb750", + "metadata": {}, + "source": [ + "**Note** : Please download the **tool_helper_functions.py** Python file before proceeding to further steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a789a344-ca2e-49a6-b8ce-e7c9b1a0d673", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "from tool_helper_functions import (\n", + " batch_process_documents,\n", + " poll_hitl_operations,\n", + " get_processor_metadata,\n", + " enhance_and_save_pdfs,\n", + " walk_the_ocr,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f60bfb4d-f9e2-48de-894a-f0c736ba45e6", + "metadata": {}, + "source": [ + "# 2. Input Details : Configure below Input variables" + ] + }, + { + "cell_type": "markdown", + "id": "72b880fe-b8f5-466d-b7aa-fe387b7008ad", + "metadata": {}, + "source": [ + "* **project_id**: GCP project ID\n", + "* **project_num**: GCP project Number\n", + "* **location**: Processor location `us` or `eu`\n", + "* **cde_processor_id**: CDE processor ID to call batch process\n", + "* **gcs_input_uri**: GCS folder which contains input pdf files(files with only specific-use-case tables)\n", + "* **input_mime_type**: Mime type of input files which is `application/pdf` here\n", + "* **gcs_output_bucket_uri**: GCS output bucket uri without trailing slash\n", + "* **gcs_output_uri_prefix**: GCS output folder path to store results\n", + "* **field_mask**: To store specific keys of document proto (entities,pages.pageNumber)\n", + "* **timeout**: to wait for batch process LRO operation to complete\n", + "* **gcs_cde_hitl_output_prefix**: GCS folder which stored HITL output results fro CDE prpcessor\n", + "* **line_enhancement_vertical_offset**: Offset used to adjust the placement of the vertical lines, it can be tuned based on the layout\n", + "* **line_enhancement_horizontal_offset**: Offset used to adjust the placement of the horizontal lines, it can be tuned based on the layout\n", + "* **flow**: for this notebook file flow is `line_enhancement_basic`\n", + "* **fp_processor_id**: FP Processor ID to call batch process\n", + "* **fp_processor_v2**: FP version2 ID to call batch process\n", + "* **gcs_line_enhance_output_prefix**: GCS prefix to store Line enhancement results\n", + "* **gcs_fpoutput_uri_prefix**: GCS prefix to store FP results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0d6c085-a232-480e-ba7d-7ac00d47e185", + "metadata": {}, + "outputs": [], + "source": [ + "datetime_suffix = \"{date:%Y-%m-%d_%H:%M:%S}\".format(date=datetime.datetime.now())\n", + "\n", + "project_id = \"\"\n", + "project_num = \"\"\n", + "location = \"\" # us or eu\n", + "cde_processor_id = 
\"\"\n", + "gcs_input_uri = f\"gs://bucket_name/prefix/to_input/{datetime_suffix}\"\n", + "input_mime_type = \"\" # \"application/pdf\"\n", + "gcs_output_bucket_uri = \"gs://bucket_name\"\n", + "gcs_output_uri_prefix = f\"output_folder/prefix/{datetime_suffix}\"\n", + "field_mask = None # \"entities,pages.pageNumber\"\n", + "timeout = 5000\n", + "gcs_cde_hitl_output_prefix = \"cde-hitl/output/prefix\"\n", + "line_enhancement_vertical_offset = 25\n", + "line_enhancement_horizontal_offset = 5\n", + "\n", + "flow = \"line_enhancement_ocr_walk\"\n", + "fp_processor_id = \"\"\n", + "fp_processor_v2 = \"\n", + " \n", + " CDE Sample\n", + " \n", + " \n", + " \n", + "Here are sample row headers and column headers which we followed while training CDE for our specific use-case table \n", + "\n", + "**column headers** are as follow a[\"SCC\", \"DNSH\", \"DNSH_P\", \"code\", \"business_measure\", \"DNSH_BE\", \"DNSH_CCA\", \"DNSH_CCM\", \"DNSH_CE\", \"DNSH_WMR\", \"min_safeguards\", \"proportion_of_bm\", \"SCC_BE\", \"SCC_CCA\", \"SCC_CCM\", \"SCC_CE\", \"SCC_P\", \"SCC_WMR\"] and **row headers** are as follow [\"taxonomy_disclosure\", \"activity\"]" + ] + }, + { + "cell_type": "markdown", + "id": "53acbcf2-499b-481d-a64d-f2f334554577", + "metadata": {}, + "source": [ + "# Script" + ] + }, + { + "cell_type": "markdown", + "id": "2ad3f0db-8f32-4fec-b480-a17cd2b39967", + "metadata": { + "tags": [] + }, + "source": [ + "# 1. Import Modules/Packages" + ] + }, + { + "cell_type": "markdown", + "id": "92f5f52a-b0d5-4483-afa6-f8c78317c4cd", + "metadata": {}, + "source": [ + "**Note** : Please download the **tool_helper_functions.py** Python file before proceeding to further steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a789a344-ca2e-49a6-b8ce-e7c9b1a0d673", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "from tool_helper_functions import (\n", + " batch_process_documents,\n", + " poll_hitl_operations,\n", + " get_processor_metadata,\n", + " parse_document_tables,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f60bfb4d-f9e2-48de-894a-f0c736ba45e6", + "metadata": {}, + "source": [ + "# 2. 
Input Details : Configure below Input variables" + ] + }, + { + "cell_type": "markdown", + "id": "72b880fe-b8f5-466d-b7aa-fe387b7008ad", + "metadata": {}, + "source": [ + "* **project_id**: GCP project ID\n", + "* **location**: Processor location `us` or `eu`\n", + "* **fp_processor_id**: FP Processor ID to call batch process\n", + "* **gcs_input_uri**: GCS folder which contains input pdf files(files with only specific-use-case tables)\n", + "* **input_mime_type**: Mime type of input files which is `application/pdf` here\n", + "* **gcs_output_bucket_uri**: GCS output bucket uri without trailing slash\n", + "* **gcs_output_uri_prefix**: GCS output folder path to store results\n", + "* **field_mask**: To store specific keys of document proto (entities,pages.pageNumber)\n", + "* **timeout**: to wait for batch process LRO operation to complete\n", + "* **fp_processor_v**: FP version(V1 or V2) ID to call batch process" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0d6c085-a232-480e-ba7d-7ac00d47e185", + "metadata": {}, + "outputs": [], + "source": [ + "project_id = \"\"\n", + "location = \"\" # us or eu\n", + "fp_processor_id = \"\"\n", + "gcs_input_uri = f\"gs://bucket_name/prefix/to_input/{datetime_suffix}\"\n", + "input_mime_type = \"\" # \"application/pdf\"\n", + "gcs_output_bucket_uri = \"gs://bucket_name\"\n", + "gcs_output_uri_prefix = f\"tables_to_csv/output_folder/prefix/{datetime_suffix}\"\n", + "field_mask = None\n", + "timeout = 5000\n", + "fp_processor_v = \"\" # FP processor V1 or V2 id\n", + "\n", + "gcs_output_bucket = gcs_output_bucket_uri.replace(\"gs://\", \"\")" + ] + }, + { + "cell_type": "markdown", + "id": "bc17cd73-dc06-44c9-9e1c-95b91224b077", + "metadata": {}, + "source": [ + "# 3. Run below code" + ] + }, + { + "cell_type": "markdown", + "id": "070b4fdc-86ca-4fb9-acc7-8882d5e55f02", + "metadata": { + "tags": [] + }, + "source": [ + "Now call `batch_process_documents` function to process all files in input folder(each file contains specific-use-case table only), it results metadata & operation_id of batch process(Long Running Operation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6810abf-b33f-4501-8f8a-d81aaca01d63", + "metadata": {}, + "outputs": [], + "source": [ + "fp_metadata, fp_operation = batch_process_documents(\n", + " project_id,\n", + " location,\n", + " fp_processor_id,\n", + " gcs_input_uri,\n", + " input_mime_type,\n", + " gcs_output_bucket_uri,\n", + " f\"{gcs_output_uri_prefix}/fp_output\",\n", + " field_mask,\n", + " timeout,\n", + " fp_processor_v,\n", + ")\n", + "print(\"FP batch process completed\")" + ] + }, + { + "cell_type": "markdown", + "id": "d83a9335-8e1c-4a3d-9fb2-8ddd1c568605", + "metadata": {}, + "source": [ + "If you configured HITL then you can use below `get_processor_metadata` and `poll_hitl_operations` function, if not you can skip running these two function and proceed with running parse_document_tables function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01269740-9680-4c83-84ed-7b1f2a2836b0", + "metadata": {}, + "outputs": [], + "source": [ + "fp_input_output_map = get_processor_metadata(fp_metadata, fp=True)\n", + "poll_hitl_operations(project_num, location, fp_input_output_map)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba9583c9-3e01-441f-a2ac-135d72a83709", + "metadata": {}, + "outputs": [], + "source": [ + "parse_document_tables(\n", + " gcs_output_bucket,\n", + " f\"{gcs_output_uri_prefix}/fp_output\",\n", + " 
f\"{gcs_output_uri_prefix}/tables_csv\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "589ae870-f711-4736-ab1e-0b08fa189e74", + "metadata": { + "tags": [] + }, + "source": [ + "# 4. Output Samples" + ] + }, + { + "cell_type": "markdown", + "id": "596f22c7-e68a-4f34-8ce9-e7b4822d31f2", + "metadata": {}, + "source": [ + "Table Sample from pdf file\n", + "![](./Images/line_enhancement_basic_table_img.png)" + ] + }, + { + "cell_type": "markdown", + "id": "6746622b-634a-44fe-9e65-c8cf1c678775", + "metadata": {}, + "source": [ + "Sample output folder structure\n", + "![](./Images/fp_tables_to_csv_output_folder.png)" + ] + }, + { + "cell_type": "markdown", + "id": "9ac67de3-7306-4e41-b046-7021ae086d79", + "metadata": {}, + "source": [ + "output sample for one-table which stored as csv files in GCS bucket\n", + "![](./Images/fp_tables_to_csv_output_csv_sample.png)" + ] + } + ], + "metadata": { + "environment": { + "kernel": "conda-root-py", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/advance_table_line_enhancement/line_enhancement_basic_flow.ipynb b/incubator-tools/advance_table_line_enhancement/line_enhancement_basic_flow.ipynb new file mode 100644 index 000000000..7ffe86c72 --- /dev/null +++ b/incubator-tools/advance_table_line_enhancement/line_enhancement_basic_flow.ipynb @@ -0,0 +1,422 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "79c0a10c-1269-4a48-b4c5-ef4bbbdd2644", + "metadata": {}, + "source": [ + "# Line Enhancement Basic Flow" + ] + }, + { + "cell_type": "markdown", + "id": "0f01fe57-ffc0-402b-81ac-5f77ebd77ece", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "7fa07f2d-5087-4d8d-9ceb-5d46a21e8926", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "a2c6bbc7-49af-4a2d-98b8-09b9b8739e57", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This tool, developed in Python, is designed to optimize the extraction of tables from PDF documents for conversion into CSV files, with a specific focus on enhancing the clarity and readability of table lines before processing. This tool first enhances the table lines in the input PDFs by improving the definition of table borders and lines, it ensures a more accurate and efficient parsing through the Form Parser, ultimately storing the extracted tables in a GCS bucket with enhanced precision." 
+ ] + }, + { + "cell_type": "markdown", + "id": "107ff6ab-a7a9-4e31-b00f-9e7706fca09f", + "metadata": {}, + "source": [ + "# Pre-requisites" + ] + }, + { + "cell_type": "markdown", + "id": "05b0a705-8144-40d4-a92a-15488bd4d1bf", + "metadata": {}, + "source": [ + "This tool requires the following services:\n", + "\n", + " * Vertex AI Notebook instance\n", + " * Access to Document AI Form Parser & CDE Processor\n", + " * GCS Bucket containing input PDFs & to store output results\n", + " \n", + "Google Jupyter Notebook is used for running the python notebook file. Cloud Storage Buckets is needed to store and pass input files to this script & to store results." + ] + }, + { + "cell_type": "markdown", + "id": "0c53efdb-4975-4c19-85c6-cd0262f18cf8", + "metadata": {}, + "source": [ + "CDE for Headers, Create a Custom Document Extractor(CDE) Processor & Configure HITL to review poor performing documents. Train your CDE as per your use-case table by annotating **row headers** & **column headers** for specific use-case-table\n", + "* Input for this step is GCS bucket containing re-builted PDF files(which have only tables), now run `batch_process_documents`\n", + "* Output JSON files will be stored in GCS bucket " + ] + }, + { + "cell_type": "markdown", + "id": "e21aa662-6e89-4839-a902-adcd55d63580", + "metadata": {}, + "source": [ + "Sample image after training CDE processor for row columns & header columns\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
CDE Sample
\n", + "Here are sample row headers and column headers which we followed while training CDE for our specific use-case table \n", + "\n", + "**column headers** are as follow a [\"SCC\", \"DNSH\", \"DNSH_P\", \"code\", \"business_measure\", \"DNSH_BE\", \"DNSH_CCA\", \"DNSH_CCM\", \"DNSH_CE\", \"DNSH_WMR\", \"min_safeguards\", \"proportion_of_bm\", \"SCC_BE\", \"SCC_CCA\", \"SCC_CCM\", \"SCC_CE\", \"SCC_P\", \"SCC_WMR\"] and **row headers** are as follow [\"taxonomy_disclosure\", \"activity\"]" + ] + }, + { + "cell_type": "markdown", + "id": "53acbcf2-499b-481d-a64d-f2f334554577", + "metadata": {}, + "source": [ + "# Script" + ] + }, + { + "cell_type": "markdown", + "id": "2ad3f0db-8f32-4fec-b480-a17cd2b39967", + "metadata": {}, + "source": [ + "# 1. Import Modules/Packages" + ] + }, + { + "cell_type": "markdown", + "id": "5b138c83-b6e6-4900-86fa-33a6247b1360", + "metadata": {}, + "source": [ + "**Note** : Please download the **tool_helper_functions.py** Python file before proceeding to further steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a789a344-ca2e-49a6-b8ce-e7c9b1a0d673", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "from tool_helper_functions import (\n", + " batch_process_documents,\n", + " poll_hitl_operations,\n", + " enhance_and_save_pdfs,\n", + " get_processor_metadata,\n", + " parse_document_tables,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f60bfb4d-f9e2-48de-894a-f0c736ba45e6", + "metadata": {}, + "source": [ + "# 2. Input Details : Configure below Input variables" + ] + }, + { + "cell_type": "markdown", + "id": "72b880fe-b8f5-466d-b7aa-fe387b7008ad", + "metadata": {}, + "source": [ + "* **project_id**: GCP project ID\n", + "* **project_num**: GCP project Number\n", + "* **location**: Processor location `us` or `eu`\n", + "* **cde_processor_id**: CDE processor ID to call batch process\n", + "* **gcs_input_uri**: GCS folder which contains input pdf files(files with only specific-use-case tables)\n", + "* **input_mime_type**: Mime type of input files which is `application/pdf` here\n", + "* **gcs_output_bucket_uri**: GCS output bucket uri without trailing slash\n", + "* **gcs_output_uri_prefix**: GCS output folder path to store results\n", + "* **field_mask**: To store specific keys of document proto (entities,pages.pageNumber)\n", + "* **timeout**: to wait for batch process LRO operation to complete\n", + "* **gcs_cde_hitl_output_prefix**: GCS folder which stored HITL output results fro CDE prpcessor\n", + "* **line_enhancement_vertical_offset**: Offset used to adjust the placement of the vertical lines, it can be tuned based on the layout\n", + "* **line_enhancement_horizontal_offset**: Offset used to adjust the placement of the horizontal lines, it can be tuned based on the layout\n", + "* **flow**: for this notebook file flow is `line_enhancement_basic`\n", + "* **fp_processor_id**: FP Processor ID to call batch process\n", + "* **fp_processor_v1**: FP version ID to call batch process\n", + "* **gcs_line_enhance_output_prefix**: GCS prefix to store Line enhancement results\n", + "* **gcs_fpoutput_uri_prefix**: GCS prefix to store FP results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0d6c085-a232-480e-ba7d-7ac00d47e185", + "metadata": {}, + "outputs": [], + "source": [ + "datetime_suffix = \"{date:%Y-%m-%d_%H:%M:%S}\".format(date=datetime.datetime.now())\n", + "\n", + "project_id = \"\"\n", + "project_num = \"\"\n", + "location = \"\" # us or eu\n", + 
"cde_processor_id = \"\"\n", + "gcs_input_uri = f\"gs://bucket_name/prefix/to_input/{datetime_suffix}\"\n", + "input_mime_type = \"\" # \"application/pdf\"\n", + "gcs_output_bucket_uri = \"gs://bucket_name\"\n", + "gcs_output_uri_prefix = f\"output_folder/prefix/{datetime_suffix}\"\n", + "field_mask = None # \"entities,pages.pageNumber\"\n", + "timeout = 5000\n", + "gcs_cde_hitl_output_prefix = \"cde-hitl/output/prefix\"\n", + "line_enhancement_vertical_offset = 25\n", + "line_enhancement_horizontal_offset = 5\n", + "\n", + "flow = \"line_enhancement_basic\"\n", + "fp_processor_id = \"\"\n", + "fp_processor_v1 = \"\"\n", + "gcs_line_enhance_output_prefix = \"line_enhancement_basic/output/prefix\"\n", + "gcs_fpoutput_uri_prefix = \"fp_output/prefix\"\n", + "\n", + "\n", + "line_enhance_prefix = f\"{gcs_line_enhance_output_prefix}/{datetime_suffix}\"\n", + "gcs_output_bucket = gcs_output_bucket_uri.replace(\"gs://\", \"\")" + ] + }, + { + "cell_type": "markdown", + "id": "bc17cd73-dc06-44c9-9e1c-95b91224b077", + "metadata": {}, + "source": [ + "# 3. Run below code" + ] + }, + { + "cell_type": "markdown", + "id": "070b4fdc-86ca-4fb9-acc7-8882d5e55f02", + "metadata": {}, + "source": [ + "Now call `batch_process_documents` function to process all files in input folder(each file contains specific-use-case table only), it results metadata & operation_id of batch process(Long Running Operation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a08d85f9-e1eb-4897-a1e8-e8e0abcacdd2", + "metadata": {}, + "outputs": [], + "source": [ + "cde_metadata, cde_operation = batch_process_documents(\n", + " project_id,\n", + " location,\n", + " cde_processor_id,\n", + " gcs_input_uri,\n", + " input_mime_type,\n", + " gcs_output_bucket_uri,\n", + " gcs_output_uri_prefix,\n", + " field_mask,\n", + " timeout,\n", + ")\n", + "print(\"CDE batch process completed\")" + ] + }, + { + "cell_type": "markdown", + "id": "46c39adc-e0bf-4329-a77c-b88eb083f4ce", + "metadata": {}, + "source": [ + "Now use `get_processor_metadata` function from utils module, it takes batch process metsdata as input and results key-value pairs of filenames & it's prefix and hitl operation-id(if input files triggers hitl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2a8006e-2ab7-4644-9529-ef141c7d16ba", + "metadata": {}, + "outputs": [], + "source": [ + "cde_input_output_map = get_processor_metadata(cde_metadata)\n", + "# cde_input_output_map variable data as below\n", + "# {'03_Non-Financial_Corporate_Report_2022_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/0', 'hitl': '12795457638097959002'}, '1962771_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/1', 'hitl': '11860520012484438543'}, '2022_VGT_Group Annual Report_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/5', 'hitl': '2523802694474965110'}, 'DE0007030009-JA-2022-EQ-E-00-pg144_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/3', 'hitl': '14342450698739476592'}, 'DE0007030009-JA-2022-EQ-E-00_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/4', 'hitl': '17242897657994716395'}, 'DE000STRA555-JA-2022-EQ-E-00_extracted.pdf': {'cde': 'msci/TESTING/test_cde_output/2023-11-03_05:45:35/4236894205843634293/2', 'hitl': '2909143051612169782'}}" + ] + }, + { + "cell_type": "markdown", + "id": 
"31a5e855-b9c8-4fe0-be03-3453f48f79a5", + "metadata": {}, + "source": [ + "`poll_hitl_operations` is a waiting function to check & resolve HITL triggered documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee1ebbcf-a0af-4fba-9390-2c79f6a75984", + "metadata": {}, + "outputs": [], + "source": [ + "poll_hitl_operations(project_num, location, cde_input_output_map)" + ] + }, + { + "cell_type": "markdown", + "id": "9345deef-e48b-41ce-8289-487df8b4ba74", + "metadata": {}, + "source": [ + "Now run `enhance_and_save_pdfs` function from line_enhancement module. Here we are identifying y-coordinates of row-headers to draw horizontal-lines. This is where actual line enhancement process for CDE output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "628e4ea3-1a21-48d5-8588-5f85e0a9b49a", + "metadata": {}, + "outputs": [], + "source": [ + "# line_enhance_prefix = f\"{gcs_line_enhance_output_prefix}/{datetime_suffix}\"\n", + "enhance_and_save_pdfs(\n", + " gcs_output_bucket,\n", + " gcs_cde_hitl_output_prefix,\n", + " line_enhance_prefix,\n", + " cde_input_output_map,\n", + " line_enhancement_vertical_offset,\n", + " line_enhancement_horizontal_offset,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "25e95a52-e22e-476a-b048-1fd508b7cd11", + "metadata": {}, + "source": [ + "After enhancing pdf i.e, *drawing/brightening* horizontal and vertical lines of table is as below \n", + "![](./Images/after_line_enhancement_sample.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6810abf-b33f-4501-8f8a-d81aaca01d63", + "metadata": {}, + "outputs": [], + "source": [ + "gcs_line_output_path = (\n", + " f\"{gcs_output_bucket_uri}/{gcs_line_enhance_output_prefix}/{datetime_suffix}\"\n", + ")\n", + "fp_processor_v = fp_processor_v1\n", + "\n", + "fp_metadata, fp_operation = batch_process_documents(\n", + " project_id,\n", + " location,\n", + " fp_processor_id,\n", + " gcs_line_output_path,\n", + " input_mime_type,\n", + " gcs_output_bucket_uri,\n", + " f\"{gcs_fpoutput_uri_prefix}/{datetime_suffix}\",\n", + " field_mask,\n", + " timeout,\n", + " fp_processor_v,\n", + ")\n", + "print(\"FP batch process completed\")" + ] + }, + { + "cell_type": "markdown", + "id": "e526df11-608f-4673-9268-13b8b04d054a", + "metadata": {}, + "source": [ + "Now use `parse_document_tables` function from utils package. Results will be stored as CSV files in gcs folder of \"{gcs_output_uri_prefix}/{flow}/{datetime_suffix}\", inside this folder we have number of csv files based on count of tables in input files and it follows file name as \"filename_without_extension/pg{page_number}_tb{table_count}.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba9583c9-3e01-441f-a2ac-135d72a83709", + "metadata": {}, + "outputs": [], + "source": [ + "print(flow)\n", + "## parse the FP output and store the csv in GCS bucket\n", + "parse_document_tables(\n", + " gcs_output_bucket,\n", + " f\"{gcs_fpoutput_uri_prefix}/{datetime_suffix}\",\n", + " f\"{gcs_output_uri_prefix}/{flow}/{datetime_suffix}\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8640b89a-82e0-4e4f-8dbd-5bbaac9a35af", + "metadata": {}, + "source": [ + "Refer below image for sample output folder structure, after running `parse_document_tables` function\n", + "![](./Images/leb_folder_sample.png)" + ] + }, + { + "cell_type": "markdown", + "id": "589ae870-f711-4736-ab1e-0b08fa189e74", + "metadata": { + "tags": [] + }, + "source": [ + "# 4. 
Output Samples" + ] + }, + { + "cell_type": "markdown", + "id": "596f22c7-e68a-4f34-8ce9-e7b4822d31f2", + "metadata": {}, + "source": [ + "Table Sample from pdf file\n", + "![](./Images/line_enhancement_basic_table_img.png)" + ] + }, + { + "cell_type": "markdown", + "id": "9ac67de3-7306-4e41-b046-7021ae086d79", + "metadata": {}, + "source": [ + "output sample for one-table which stored as csv files in GCS bucket\n", + "![](./Images/line_enhancement_basic_table_img(pg1_tb0.csv).png)" + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/advance_table_line_enhancement/tool_helper_functions.py b/incubator-tools/advance_table_line_enhancement/tool_helper_functions.py new file mode 100644 index 000000000..89720b63c --- /dev/null +++ b/incubator-tools/advance_table_line_enhancement/tool_helper_functions.py @@ -0,0 +1,1394 @@ +# pylint: disable=R0913 +# pylint: disable=R0914 +# pylint: disable=E0401 +# pylint: disable=C0302 +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This module contains helper functions for Advance Table Parsing Tool""" +import math +import re +import time +from collections import defaultdict +from io import BytesIO +from typing import Dict, List, MutableSequence, Tuple, Union, Any + +import numpy as np +import pandas as pd +import PyPDF2 +from google.api_core.client_options import ClientOptions +from google.api_core.exceptions import InternalServerError, RetryError +from google.cloud import documentai, storage +from google.longrunning import operations_pb2 +from google.longrunning.operations_pb2 import GetOperationRequest +from PIL import Image as PilImage +from PIL import ImageDraw + + +def batch_process_documents( + project_id: str, + location: str, + processor_id: str, + gcs_input_uri: str, + gcs_output_bucket: str, + gcs_output_uri_prefix: str, + field_mask: Union[str, None] = None, + timeout: int = 5000, + processor_version_id: Union[str, None] = None, +) -> Tuple[documentai.BatchProcessMetadata, str]: + """ + Performs batch operation on input gcs folder + Args: + project_id (str): Google Cloud project ID. + location (str): Location of the processor. + processor_id (str): ID of the Document AI processor. + gcs_input_uri (str): Cloud Storage URI for the input GCS folder. + gcs_output_bucket (str): Google Cloud Storage bucket for output. + gcs_output_uri_prefix (str): Output GCS URI prefix. + field_mask (Union[str, None]): Field mask for output. Defaults to None. 
+ timeout (int): Timeout for the operation. Defaults to 5000. + processor_version_id (Union[str, None]): Processor version ID. Defaults to None. + Returns: + Tuple[documentai.BatchProcessMetadata, str]: Tuple containing metadata and operation ID. + """ + + # You must set the api_endpoint if you use a location other than 'us'. + opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com") + client = documentai.DocumentProcessorServiceClient(client_options=opts) + # gcs_input_uri = "gs://bucket/direcory_prefix" + print("gcs_input_uri", gcs_input_uri) + gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=f"{gcs_input_uri}/") + input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix) + # Cloud Storage URI for the Output Directory + # This must end with a trailing forward slash `/` + destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/" + print("gcs_output_uri", destination_uri) + gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig( + gcs_uri=destination_uri, field_mask=field_mask + ) + # Where to write results + output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config) + if processor_version_id: + # The full resource name of the processor version, e.g.: + # base_url = "projects/{project_id}/locations/{location}/processors/{processor_id}/" + # url = base_url + "processorVersions/{processor_version_id}" + name = client.processor_version_path( + project_id, location, processor_id, processor_version_id + ) + else: + # The full resource name of the processor, e.g.: + # projects/{project_id}/locations/{location}/processors/{processor_id} + name = client.processor_path(project_id, location, processor_id) + request = documentai.BatchProcessRequest( + name=name, + input_documents=input_config, + document_output_config=output_config, + ) + # BatchProcess returns a Long Running Operation (LRO) + operation = client.batch_process_documents(request) + # Continually polls the operation until it is complete. + # This could take some time for larger files + # Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID + + try: + print(f"Waiting for operation {operation.operation.name} to complete...") + operation.result(timeout=timeout) + # Catch exception when operation doesn't finish before timeout + except (RetryError, InternalServerError) as e: + print(e.message) + + # NOTE: Can also use callbacks for asynchronous processing + # + # def my_callback(future): + # result = future.result() + # + # operation.add_done_callback(my_callback) + + # Once the operation is complete, + # get output document information from operation metadata + metadata = documentai.BatchProcessMetadata(operation.metadata) + if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED: + raise ValueError(f"Batch Process Failed: {metadata.state_message}") + return metadata, operation.operation.name.split("/")[-1] + + +def read_json_output( + output_bucket: str, output_prefix: str, hitl: bool = False +) -> Dict[str, documentai.Document]: + """ + Read the processor json output stored in the GCS bucket. + Args: + output_bucket (str): Google Cloud Storage bucket for the output. + output_prefix (str): Output GCS URI prefix. + hitl (bool): Flag indicating whether Human in the Loop (HITL) is enabled. Defaults to False. + Returns: + Dict[str, documentai.Document]: Dictionary containing Document objects. 
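    Example (illustrative; the bucket and prefix names are placeholders):
        docs = read_json_output("my-bucket", "fp_output/2023-11-03")
        for file_key, doc in docs.items():
            print(file_key, len(doc.pages))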
+ """ + + storage_client = storage.Client() + documents = {} + # Get List of Document Objects from the Output Bucket + output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix) + # Document AI may output multiple JSON files per source file + # For current pipeline, assumption is we will have single JSON file + for blob in output_blobs: + # Document AI should only output JSON files to GCS + if ".json" not in blob.name: + print( + f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}" + ) + continue + # Download JSON File as bytes object and convert to Document Object + print(f"Fetching {blob.name}") + document = documentai.Document.from_json( + blob.download_as_bytes(), ignore_unknown_fields=True + ) + if hitl: + documents[blob.name.split("/")[-2]] = document + else: + documents[blob.name.split("/")[-1][:-7]] = document + return documents + + +def get_processor_metadata( + cde_metadatap: documentai.Document, fp: bool = False +) -> Dict[str, Union[str, Dict[str, str]]]: + """ + Parse the processor LRO operation metadata. + Args: + cde_metadatap (documentai.Document): Document containing processor LRO operation metadata. + fp (bool): Flag indicating whether to include file paths. Defaults to False. + + Returns: + Dict[str, Union[str, Dict[str, str]]]: Mapping of file names to output details. + """ + + input_output_map: Dict[str, Union[str, Dict[str, str]]] = {} + for process in cde_metadatap.individual_process_statuses: + filen = process.input_gcs_source.split("/")[-1] + output = "/".join(process.output_gcs_destination.split("/")[3:]) + if fp: + input_output_map[filen] = output + else: + hitl = process.human_review_status.human_review_operation.split("/")[-1] + input_output_map[filen] = {"cde": output, "hitl": hitl} + return input_output_map + + +def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str: + """ + Document AI identifies text in different parts of the document by their + offsets in the entirety of the document's text. This function converts + offsets to a string. + Args: + layout (documentai.Document.Page.Layout): Layout information for a page. + text (str): The entire text content of the document. + Returns: + str: Concatenated text corresponding to the specified layout. + """ + + response = "" + # If a text segment spans several lines, it will + # be stored in different text segments. + for segment in layout.text_anchor.text_segments: + start_index = int(segment.start_index) + end_index = int(segment.end_index) + response += text[start_index:end_index] + return response + + +def get_matched_field(block_text: str, pattern: str = "([0-9]+)") -> str: + """ + Search particular pattern in cell values. + Args: + block_text (str): The text content of the block. + pattern (str): Regular expression pattern to search in the block text. + Defaults to "([0-9]+)". + Returns: + str: Matched field based on the specified pattern. + """ + + m = re.search(pattern, block_text) + t = "" + if m: + for seq, grp in enumerate(m.groups()): + if grp: + if seq == 0: + t = m.group(seq + 1) + else: + t += " " + m.group(seq + 1) + return t.strip() + return t + + +def get_processed_map( + row_map: Dict[int, Dict[str, List[int]]], offset: int +) -> Dict[int, Dict[str, List[int]]]: + """ + Adjust the headers boundaries. + Args: + row_map (Dict[int, Dict[str, List[int]]]): Mapping of rows to headers with boundaries. + offset (int): Offset value for adjusting header boundaries. 
+ Returns: + Dict[int, Dict[str, List[int]]]: Adjusted header boundaries in the processed map. + """ + + processed_map_ = {} + for k, v in row_map.items(): + processed_map = { + i: [round(j[0] / 10) - offset, round(j[1] / 10) + offset] + for i, j in v.items() + if i not in ["DNSH", "SCC"] + } + a, b = processed_map["taxonomy_disclosure"] + processed_map["taxonomy_disclosure"] = [a, b + 2] + processed_map_[k] = processed_map + return processed_map_ + + +def get_coordinates_map( + document: documentai.Document, +) -> Tuple[ + Dict[int, List[List[int]]], + Dict[int, List[int]], + Dict[int, Dict[str, List[int]]], + Dict[int, List[int]], +]: + """ + Get headers and rows coordinates. + Args: + document(documentai.Document): Document containing information. + Returns: + Tuple + """ + + # row_keywords = {"taxonomy","sum","economic","taxonomy-eligible","taxonomy-non-eligible"} + x_coordinates_, y_coord_, row_map_, max_ycd_ = {}, {}, {}, {} + for pn, _ in enumerate(document.pages): + row_coords = [] + x_coordinates = [] + y_coord = [] + row_map = {} + max_ycd = [] + dimension = document.pages[pn].dimension + width, height = dimension.width, dimension.height + # capture min col y of table + ycd_min = math.inf + # capture min row y of table + for entity in document.entities: + pno = entity.page_anchor.page_refs[0].page + if pno != pn: + continue + if entity.type_ in ["DNSH", "SCC"]: + continue + ycd = -1 + xx = [] + for coord in entity.page_anchor.page_refs[ + 0 + ].bounding_poly.normalized_vertices: + x = round(coord.x * width) + y = round(coord.y * height) + if entity.type_ == "activity": + ycd = max(ycd, y) + elif entity.type_ == "taxonomy_disclosure": + row_coords.append(x) + y = round(coord.y * height) + ycd = max(ycd, y) + y_coord.append(y) + elif x not in xx: + xx.append(x) + ycd_min = min(ycd_min, y) + if ycd != -1: + max_ycd.append(ycd) + if xx: + # sort the x1,x2 coordinates before storing + xx.sort() + x_coordinates.append(xx) + row_map[entity.type_] = xx + if row_coords: + row_min_max = [min(row_coords), max(row_coords)] + x_coordinates.append(row_min_max) + row_map["taxonomy_disclosure"] = row_min_max + + # store the min col y of table + if ycd_min != math.inf: + max_ycd.append(math.ceil(ycd_min)) + x_coordinates.sort(key=lambda x: x[0]) + x_coordinates_[pn] = x_coordinates + y_coord.sort() + y_coord_[pn] = y_coord + max_ycd.sort() + max_ycd_[pn] = max_ycd + row_map_[pn] = row_map + return x_coordinates_, y_coord_, row_map_, max_ycd_ + + +def get_operation(location: str, operation_name: str) -> operations_pb2.Operation: + """ + Gets Long Running Operation details. + Args: + location (str): Location of the operation. + operation_name (str): Name of the operation. + Returns: + operations_pb2.Operation: Long Running Operation details. + """ + + # You must set the `api_endpoint` if you use a location other than "us". + opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com") + client = documentai.DocumentProcessorServiceClient(client_options=opts) + request = GetOperationRequest(name=operation_name) + operation = client.get_operation(request=request) + return operation + + +def poll_hitl_operations( + project_num: str, location: str, metadata: Dict[str, Dict[str, str]] +) -> None: + """ + Poll Long Running Operation to check status. + Args: + project_num (str): Project number. + location (str): Location of the operation. + metadata (Dict[str, Dict[str, str]]): Metadata containing HITL information. 
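    Example (illustrative; the project number and operation id are placeholders):
        metadata = {"sample.pdf": {"cde": "cde_output/prefix/0", "hitl": "1234567890"}}
        poll_hitl_operations("123456789012", "us", metadata)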
+ """ + # use callbacks for asynchronous processing + # def my_callback(future): + # result = future.result() + operations = [] + for v in metadata.values(): + operation_id = v.get("hitl", None) + if operation_id: + operation_name = ( + f"projects/{project_num}/locations/{location}/operations/{operation_id}" + ) + operations.append(operation_name) + num_operations = len(operations) + print(f"Successfully scheduled {num_operations} HITL operations.") + while operations: + operations = [ + operation + for operation in operations + if not get_operation(location, operation).done + ] + if not operations: + break + print( + f"Still waiting for {len(operations)} HITL operations to complete" + ) + time.sleep(100) + print(f"Finished waiting for all {num_operations} HITL operations.") + + +def get_table_data_( + rows: MutableSequence[documentai.Document.Page.Table.TableRow], text: str +) -> List[List[str]]: + """ + Get Text data from table rows + Args: + rows (MutableSequence[documentai.Document.Page.Table.TableRow]): List of table rows. + text (str): Full text of the document. + + Returns: + List[List[str]]: List of lists containing the text data from table rows. + """ + + all_values = [] + for row in rows: + current_row_values = [] + for cell in row.cells: + current_row_values.append( + text_anchor_to_text(cell.layout.text_anchor, text) + ) + all_values.append(current_row_values) + return all_values + + +def text_anchor_to_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str: + """ + Document AI identifies table data by their offsets in the entirety of the + document's text. This function converts offsets to a string. + Args: + text_anchor (object): It contains information about textanchor offsets. + text (str): Full text of the document. + Returns: + str: Converted text based on the specified offsets. + """ + + response = "" + # If a text segment spans several lines, it will + # be stored in different text segments. + for segment in text_anchor.text_segments: + start_index = int(segment.start_index) + end_index = int(segment.end_index) + response += text[start_index:end_index] + return response.strip().replace("\n", " ") + + +def parse_document_tables(output_bucket, output_prefix, output_csv_prefix): + """ + Parse the Form Parser output to extract tables. + Args: + output_bucket (str): Name of the GCS bucket where the output is stored. + output_prefix (str): Prefix for the output files. + output_csv_prefix (str): Prefix for the CSV files to be created. + """ + # storage_client = storage.Client() + # bucket = storage_client.bucket(output_bucket) + # Read the document + doc_obj_dict = read_json_output( + output_bucket=output_bucket, output_prefix=output_prefix + ) + for file_key, document in doc_obj_dict.items(): + for _ , page in enumerate(document.pages): + header_row_values: List[List[str]] = [] + body_row_values: List[List[str]] = [] + for index, table in enumerate(page.tables): + header_row_values = get_table_data_(table.header_rows, document.text) + body_row_values = get_table_data_(table.body_rows, document.text) + # Create a Pandas DataFrame to print the values in tabular format. 
+ df = pd.DataFrame( + data=body_row_values, + columns=pd.MultiIndex.from_arrays(header_row_values), + ) + # Save each table as a CSV file in the GCS bucket + output_filename = ( + f"{output_csv_prefix}/{file_key}/pg{page.page_number}_tb{index}.csv" + ) + df.to_csv(f"gs://{output_bucket}/{output_filename}", index=False) + + +def get_hitl_state(hitl_status_response: operations_pb2.Operation) -> Tuple[bool, str]: + """ + Returns the HITL state and gcs output path if the document is reviewed. + Args: + hitl_status_response (operations_pb2.Operation): HITL status response. + Returns: + Tuple[bool, str]: Tuple containing a boolean indicating whether the document + is reviewed (True if reviewed, False otherwise) and the GCS output path. + """ + + hitl_response = documentai.ReviewDocumentResponse.deserialize( + hitl_status_response.response.value + ) + hitl_status = hitl_response.state.name + hitl_destination = hitl_response.gcs_destination + if hitl_status == "REJECTED": + return False, "" + return True, hitl_destination + + +def parse_and_split_pages( + individual_process_statuses: MutableSequence[ + documentai.BatchProcessMetadata.IndividualProcessStatus + ], + output_bucket_name: str, + output_folder: str, + label: str, + location: str, +) -> None: + """ + Function takes the CDS output, splits it, and produces PDFs containing taxonomy tables, + then stores them in the specified output directory. + Args: + individual_process_statuses (MutableSequence): List of individual process statuses. + output_bucket_name (str): Output bucket name. + output_folder (str): Output folder. + label (str): Taxonomy label. + location (str): Location. + Returns: + None + """ + + client = storage.Client() + for status in individual_process_statuses: + source_bucket_name, source_blob_path = status.input_gcs_source.replace( + "gs://", "" + ).split("/", 1) + operation_id = status.human_review_status.human_review_operation.split("/")[-1] + if operation_id: + print("operation_id:", operation_id) + hitl_status_response = get_operation( + location, status.human_review_status.human_review_operation + ) + state, destination = get_hitl_state(hitl_status_response) + if state: + dest_bucket_name, dest_file_name = destination.replace( + "gs://", "" + ).split("/", 1) + dest_blob_path = "/".join(dest_file_name.split("/")[:-1]) + else: + continue + else: + dest_bucket_name, dest_blob_path = status.output_gcs_destination.replace( + "gs://", "" + ).split("/", 1) + dest_file_name = f"{dest_blob_path}/output-document.json" + source_pdf_name = source_blob_path.split("/")[-1].replace(".pdf", "") + dest_blob_name = f"{dest_blob_path}/{source_pdf_name}.json" + # Copy the source blob to the new location with a new name + # For debugging purpose + client.bucket(dest_bucket_name).copy_blob( + client.bucket(dest_bucket_name).blob(dest_file_name), + client.bucket(dest_bucket_name), + dest_blob_name, + ) + # Read JSON data from GCS + json_blob = client.bucket(dest_bucket_name).blob(dest_blob_name) + json_data = documentai.Document.from_json( + json_blob.download_as_text(), ignore_unknown_fields=True + ) + entities = json_data.entities + taxonomy_page_no = [] + for entity in entities: + if entity.type_ == label: + taxonomy_page_anchor = entity.page_anchor.page_refs + for ta_pa in taxonomy_page_anchor: + page_number = ta_pa.page + taxonomy_page_no.append(page_number) + # Read PDF from GCS + pdf_blob = client.bucket(source_bucket_name).blob(source_blob_path) + pdf_data = BytesIO(pdf_blob.download_as_bytes()) + reader = PyPDF2.PdfReader(pdf_data) + 
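        # Copy only the pages whose entities carry the taxonomy label into a new PDF.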
writer = PyPDF2.PdfWriter() + for page_num in taxonomy_page_no: + page = reader.pages[int(page_num)] + writer.add_page(page) + # Write the extracted pages back to GCS + output_pdf_data = BytesIO() + writer.write(output_pdf_data) + output_pdf_blob = client.bucket(output_bucket_name).blob( + f"{output_folder}/{source_pdf_name}_extracted.pdf" + ) + output_pdf_blob.upload_from_string( + output_pdf_data.getvalue(), content_type="application/pdf" + ) + print( + f"Pages {', '.join(map(str, taxonomy_page_no))} extracted to {output_pdf_blob.path}." + ) + + +def get_column_name_type_using_xcoord( + value: int, processed_map: Dict[str, List[int]] +) -> Tuple[Union[str, None], Union[List[str], None]]: + """ + This method returns the name of the column by its horizontal location + It would need to be set on a per-carrier basis and adjusted if the reports + change. Because some cell values span columns we can't auto-detect columns + by drawing vertical lines down the page in places where they don't intersect + with text. + """ + + for col, threshold in dict( + sorted(processed_map.items(), key=lambda item: item[1]) + ).items(): + if threshold[0] <= value <= threshold[1]: + col_string = col.split("_") + if "DNSH" in col_string or "safeguards" in col_string: + return col, ["Y", "N", "N/A", "S", "n/a"] + if "SCC" in col_string or "proportion" in col_string: + return col, ["%"] + if "business" in col_string: + return col, ["number"] + if "code" in col_string: + return col, ["code"] + return col, None + return None, None + + +def get_entire_row( + page: documentai.Document.Page, + block: documentai.Document.Page.Block, + dest_df: pd.DataFrame, + document_response: documentai.Document, + blockn: int, + height: float, + width: float, + processed_map: Dict[str, List[int]], +) -> None: + """ + Method finds the start of each row and then moves through it, collecting columns as it goes. + Args: + page (documentai.Document.Page): Document page. + block (documentai.Document.Page.Block): Document block. + dest_df (pd.DataFrame): Destination DataFrame to store extracted information. + document_response (documentai.Document): Document response. + blockn (int): Block number. + height (float): Height of the document page. + width (float): Width of the document page. + processed_map (Dict[str, List[int]]): Processed map. 
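            Maps each column name to its [min_x, max_x] boundary, as produced by
            get_processed_map.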
+ Returns: + None + """ + + y_values = [ + round(vertex.y * height) + for vertex in block.layout.bounding_poly.normalized_vertices + ] + min_y = min(y_values) + max_y = max(y_values) + idx = len(dest_df) + # col = 0 + col_block = { + col: blockn + i for i, col in zip(range(dest_df.shape[1]), dest_df.columns) + } + col_occurence = {} + for bn, block1 in enumerate(page.blocks): + # get the min and max y values for each block + y_values = [ + round(vertex.y * height) + for vertex in block1.layout.bounding_poly.normalized_vertices + ] + this_min_y = min(y_values) + 5 + this_max_y = max(y_values) - 5 + # compare if the block coordinates falls under required row block + if this_min_y >= min_y and this_max_y <= max_y: + block_text = layout_to_text(block1.layout, document_response.text) + x_valuesn = [ + round(vertex.x * width) + for vertex in block1.layout.bounding_poly.normalized_vertices + ] + # this_max_x = round(max(x_valuesn) / 10) + this_max_x = math.ceil(max(x_valuesn) / 10) + # get the column name corresponding to the x coordinate + column, col_type = get_column_name_type_using_xcoord( + this_max_x, processed_map + ) + # extract columns specified in CDE + if column: + if col_type == ["number"]: + block_text = get_matched_field( + block_text, + pattern=r"(^\(\d+\))|(\d+[,|]\d+)|(^\(\d+,\d+\))|(\d+)", + ) + elif col_type == ["%"]: + block_text = get_matched_field( + block_text, pattern=r"([0-9]+)|([0-9]+[|%])([0-9]+[|%])" + ) + elif block_text.replace("\n", "") in ["Y", "N", "N/A", "S", "n/a"]: + block_text = get_matched_field( + block_text, pattern="(N/A|Y|N|S|n/a)" + ) + elif col_type == ["code"]: + block_text = get_matched_field( + block_text, + pattern=r"([0-9]+\.[0-9]+\/[0-9]+\.[0-9]+)|([0-9]+\.[0-9]+)", + ) + elif column != "taxonomy_disclosure": + block_text = ( + "" # there are random characters in different block (즘) + ) + try: + dest_df.loc[idx, column] += ( + " " + block_text + ) # add space between 2 blocks text + # col_occurence[column] += 1 + except (TypeError, KeyError): + dest_df.loc[idx, column] = block_text + col_occurence[column] = 1 + if bn > col_block[column]: + dest_df.loc[idx, column] = "-\n" + block_text + # else: + # dest_df.loc[idx, column] = block_text + + +def is_table_region( + layout: documentai.Document.Page.Layout, ystart: int, yend: int, height: float +) -> bool: + """ + Get rows from a particular range. + Args: + layout (documentai.Document.Page.Layout): Layout information for the region. + ystart (int): Starting y-coordinate for the table region. + yend (int): Ending y-coordinate for the table region. + height (float): Height of the document page. + Returns: + bool: True if the layout region corresponds to a table region, False otherwise. + """ + + y_values = [ + round(vertex.y * height) for vertex in layout.bounding_poly.normalized_vertices + ] + min_y = min(y_values) + max_y = max(y_values) + if min_y >= ystart and max_y <= yend: + return True + return False + + +def get_table_data( + document_fp: documentai.Document, + processed_map: Dict[int, Dict[str, List[int]]], + ycord: Dict[int, List[int]], +) -> Dict[int, pd.DataFrame]: + """ + Parse the Form parser OCR output to reconstruct the table. + Args: + document_fp (documentai.Document): Document containing OCR output. + processed_map (Dict[int, Dict[str, List[int]]]): Processed map for coordinates. + ycord (Dict[int, List[int]]): Y-coordinates for each page. + Returns: + Dict[int, pd.DataFrame]: Dictionary of DataFrames, where keys are page numbers. 
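    Example (illustrative sketch; assumes the document, map and coordinates
    come from the earlier CDE/FP steps):
        tables_by_page = get_table_data(document_fp, processed_map, ycord)
        first_page_df = tables_by_page.get(0)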
+ """ + + df_list = {} + for pgn, page in enumerate(document_fp.pages): + dimension = document_fp.pages[pgn].dimension + width, height = dimension.width, dimension.height + dest_df3 = pd.DataFrame( + columns=list( + dict( + sorted(processed_map[pgn].items(), key=lambda item: item[1]) + ).keys() + ) + ) + ypgn = ycord[pgn] + ystart, yend = ypgn[0] - 2, ypgn[-1] + 2 + for bn, block in enumerate(page.blocks): + block_text = layout_to_text(block.layout, document_fp.text) + if not is_table_region(block.layout, ystart, yend, height): + continue + activity = re.search( + r"(^\d.\d+(.|) [a-zA-Z\s]+)|" + "(^Total)|" + "(^Sum[a-zA-Z1-9\\.+\\s]+)|" + "(^Revenue ([a-z]+))|" + "(^[A-Z]\\.[1-9|\\s][a-zA-Z0-9\\s\\.-]+)|" + "(^OPEX ([a-z]+))|" + "(^CAPEX ([a-z]+))|" + "(^Taxonomy ([a-z]+)|" + "^[A-Z]+[a-z]+)", + block_text, + ) + if activity: + # activity detected: table row + get_entire_row( + page, + block, + dest_df3, + document_fp, + bn, + height, + width, + processed_map[pgn], + ) + df_list[pgn] = dest_df3 + return df_list + + +def update_data( + final_df_: pd.DataFrame, final_data_: Dict[Any, Any], ea: str +) -> Dict[Any, Any]: + """ + Update the final dataframe. + Args: + final_df_ (pd.DataFrame): DataFrame containing the final data. + final_data_ (Dict[Any, Any]): Dict to be updated. + ea (str): Value to be added to the "taxonomy_disclosure" column. + Returns: + Dict[str, List[str]]: Updated defaultdict. + """ + + for column in final_df_.columns: + if column == "taxonomy_disclosure": + final_data_[column].append(ea) + else: + final_data_[column].append("") + return final_data_ + + +def process_taxonomy_disclosure(st: str) -> str: + """ + Process a simple taxonomy disclosure string. + + Args: + st (str): Input string containing a simple taxonomy disclosure. + + Returns: + str: Extracted taxonomy disclosure. + """ + + ea = re.search(r"^[A-Z]\.\s[a-zA-Z\s-]+", st) + if ea: + span = ea.span() + interstr = st[span[0]:span[1]].split("\n")[0] + return interstr + + +def process_taxonomy_disclosure_complex(st: str) -> Tuple[str, str]: + """ + Process a complex taxonomy disclosure string. + + Args: + st (str): Input string containing a complex taxonomy disclosure. + + Returns: + Tuple[str, str]: Tuple containing the remaining string after processing + and the extracted complex taxonomy disclosure. + """ + + ea = re.search(r"^[A-Z]\.[1-9](.|)[a-zA-Z()\s-]+", st) + if ea: + span = ea.span() + interstr = st[span[0]:span[1]].split("\n")[0:-1] + ans = " ".join(interstr) + st = st.replace(st[span[0] : span[1]], "") + return st, ans + + +def process_taxonomy_disclosure_multiple(row: pd.Series) -> None: + """ + Process a row containing multiple taxonomy disclosures. + + Args: + row (pd.Series): Input row containing a "taxonomy_disclosure" column. + + Returns: + None: The "taxonomy_disclosure" column in the row is updated in-place. + """ + + st = row["taxonomy_disclosure"] + row_ea = re.findall(r"\d.\d+ [a-zA-Z\s]+", st) + if len(row_ea) > 1: + row["taxonomy_disclosure"] = "\n".join([ea.replace("\n", " ").strip() for ea in row_ea]) + + +def collect_multiple_values(row: pd.Series, col: str) -> List: + """ + Collect multiple values from a specific column in a row. + + Args: + row (pd.Series): Input row containing the specified column. + col (str): Name of the column containing multiple values. + + Returns: + List: The collected values are appended to the "split_row" list. 
+ """ + + split_row = [] + for val in row[col].split("\n"): + try: + if re.search(r"^[0-9]+(.|,)[0-9]+", val): + split_row.append(val) + except ValueError: + pass + return split_row + + +def collect_and_extend_values(final_df_: pd.DataFrame, final_data_: dict, + row: pd.Series, col: str) -> None: + """ + Collect and extend values from a specific column in a row to the final data structure. + + Args: + final_df_ (pd.DataFrame): Final DataFrame structure. + final_data_ (dict): Final data structure to be extended. + row (pd.Series): Input row containing the specified column. + col (str): Name of the column containing multiple values. + + Returns: + None: The values are extended to the final data structure. + """ + + try: + split_row: List[str] = collect_multiple_values(row, col) + for column in final_df_.columns: + try: + if len(split_row) > 1: + extend_column_data(final_data_, row, column, split_row) + else: + extend_single_value(final_data_, row, column) + except ValueError: + extend_nan_values(final_data_, column, split_row) + except ValueError: + ea_ = row["taxonomy_disclosure"].replace("\n", " ") + final_data_ = update_data(final_df_, final_data_, ea_) + + +def extend_column_data(final_data_: dict, row: pd.Series, + column: str, split_row: List[str]) -> None: + """ + Extend column data in the final data structure. + + Args: + final_data_ (dict): Final data structure to be extended. + row (pd.Series): Input row containing the specified column. + column (str): Name of the column to be extended. + split_row (List[str]): List of values to be extended. + + Returns: + None: The column data is extended in the final data structure. + """ + + column_data = [data for data in row[column].split("\n") if data] + diff = len(split_row) - len(column_data) + if diff != 0: + column_data.extend([np.nan] * diff) + final_data_[column].extend(column_data) + + +def extend_single_value(final_data_: dict, row: pd.Series, column: str) -> None: + """ + Extend single value in the final data structure. + + Args: + final_data_ (dict): Final data structure to be extended. + row (pd.Series): Input row containing the specified column. + column (str): Name of the column to be extended. + + Returns: + None: The single value is extended in the final data structure. + """ + + val = row[column].replace("\n", " ").strip() + if column != "taxonomy_disclosure": + val = val.replace("-", "").strip() + final_data_[column].extend([val]) + + +def extend_nan_values(final_data_: dict, column: str, split_row: List[str]) -> None: + """ + Extend NaN values in the final data structure. + + Args: + final_data_ (dict): Final data structure to be extended. + column (str): Name of the column to be extended. + split_row (List[str]): List of NaN values to be extended. + + Returns: + None: The NaN values are extended in the final data structure. + """ + + final_data_[column].extend([np.nan] * len(split_row)) + + +def post_process( + dest_df: pd.DataFrame, col: str, processed_map: Dict[str, List[int]] +) -> Dict[Any, Any]: + """ + Process the final dataframe to remove noise from the data. 
+ """ + + final_df_ = pd.DataFrame( + columns=list( + dict(sorted(processed_map.items(), key=lambda item: item[1])).keys() + ) + ) + # Post-processing code matches expected values and rearranges them into the final dataframe + final_data_: Dict[Any, Any] = defaultdict(list) + for _ , row in dest_df.iterrows(): + if row["taxonomy_disclosure"] is np.nan: + continue + st = row["taxonomy_disclosure"] + st = st.replace(process_taxonomy_disclosure(row["taxonomy_disclosure"]) + "\n", "").strip() + final_data_ = update_data(final_df_, final_data_, process_taxonomy_disclosure( + row["taxonomy_disclosure"])) + row["taxonomy_disclosure"] = st + + st = row["taxonomy_disclosure"] + st1, ans = process_taxonomy_disclosure_complex(st) + final_data_ = update_data(final_df_, final_data_, ans) + row["taxonomy_disclosure"] = st1 + + process_taxonomy_disclosure_multiple(row) + collect_and_extend_values(final_df_, final_data_, row, col) + # try: + # # collect values if particular col(business measure) has more than one value + # split_row = collect_multiple_values(row, col) + # for column in final_df_.columns: + # try: + # if len(split_row) > 1: + # column_data = [data for data in row[column].split("\n") if data] + + # # if no. of values in particular column doesn't match with n + # diff = len(split_row) - len(column_data) + # if diff != 0: + # column_data.extend([np.nan] * diff) + # final_data_[column].extend(column_data) + # else: + # # remove `-` character + # val = row[column].replace("\n", " ").strip() + # if column != "taxonomy_disclosure": + # val = val.replace("-", "").strip() + + # final_data_[column].extend([val]) + # except ValueError: + # final_data_[column].extend([np.nan] * len(split_row)) + # except ValueError: + # ea_ = row["taxonomy_disclosure"].replace("\n", " ") + # final_data_ = update_data(final_df_, final_data_, ea_) + return final_data_ + + +def run_table_extractor_pipeline( + offset: int, + gcs_output_bucket: str, + gcs_output_uri_prefix: str, + document_fp: documentai.Document, + row_map: Dict[int, Dict[str, List[int]]], + filen: str, + ycord: Dict[int, List[int]], + col: str = "business_measure", +) -> Union[pd.Series, pd.DataFrame]: + """ + Function to parse the data extracted from FP and map with CDE headers + and store the final output as csv in the GCS bucket. + """ + + processed_map = get_processed_map(row_map, offset) + df_list = get_table_data(document_fp, processed_map, ycord) + filen_ = filen[:-4] + for pgn, df in df_list.items(): + final_data_new2 = post_process( + df.copy(), col=col, processed_map=processed_map[pgn] + ) + final_data_2_processed = final_data_new2.copy() + nrows = 0 # num of rows + for _ , v in final_data_new2.items(): + nrows = max(len(v), nrows) + + for _ , v in final_data_2_processed.items(): + length = len(v) + if length != nrows: + v.extend([np.nan] * (nrows - length)) + taxonomy_data: Union[pd.Series, pd.DataFrame] = pd.DataFrame( + final_data_2_processed + ) + taxonomy_data = ( + taxonomy_data[taxonomy_data != ""].dropna(how="all").reset_index(drop=True) + ) + taxonomy_data.to_csv( + f"gs://{gcs_output_bucket}/{gcs_output_uri_prefix}/{filen_}/{pgn}.csv", + index=False, + ) + print("Extraction completed") + return taxonomy_data + + +def walk_the_ocr( + cde_input_output_map: Dict[str, Dict[str, str]], + gcs_output_bucket: str, + gcs_cde_hitl_output_prefix: str, + fp_input_output_map: Dict[str, str], + gcs_output_uri_prefix: str, + offset: int, +) -> None: + """ + Main function to read CDE and FP json output and parse it to get final output. 
+ """ + + for file, data in cde_input_output_map.items(): + print("File:", file) + if data.get("hitl", None): + operation = data["hitl"] + cde_jsons = read_json_output( + output_bucket=gcs_output_bucket, + output_prefix=f"{gcs_cde_hitl_output_prefix}/{operation}", + hitl=True, + ) + cde_document = cde_jsons[operation] + print("HITL") + else: + cde_jsons = read_json_output( + output_bucket=gcs_output_bucket, output_prefix=data["cde"] + ) + cde_document = cde_jsons[file[:-4]] + print("NO HITL") + _ , y_coord, row_map_cde, _ = get_coordinates_map(cde_document) + fp_document_path = fp_input_output_map[file] + fp_document = read_json_output( + output_bucket=gcs_output_bucket, output_prefix=fp_document_path + ) + run_table_extractor_pipeline( + offset=offset, + gcs_output_bucket=gcs_output_bucket, + gcs_output_uri_prefix=gcs_output_uri_prefix, + document_fp=fp_document[file[:-4]], + row_map=row_map_cde, + filen=file, + ycord=y_coord, + ) + + +def draw_vertical( + idx: int, + x_coordinates: Dict[int, List[List[int]]], + hoffset_: float, + min_height: int, + max_height: int, + line_colour: str, + line_width: int, + voffset: int, + draw: ImageDraw.ImageDraw, +) -> None: + """ + Draw vertical lines on an image using the provided coordinates and parameters. + + Args: + idx (int): Index of the line to be drawn. + x_coordinates (Dict[int, List[List[int]]]): List of x-coordinates for the lines. + hoffset_ (float): Horizontal offset for the lines. + min_height (int): Minimum height for the lines. + max_height (int): Maximum height for the lines. + line_colour (str): Color of the lines. + line_width (int): Width of the lines. + voffset (int): Vertical offset for the lines. + draw (ImageDraw.ImageDraw): ImageDraw object for drawing on an image. + + Returns: + None: The function draws vertical lines on the image. + """ + for n, cor in enumerate(x_coordinates[idx]): + if n == 0: + draw.line( + [ + (cor[0] - hoffset_, min_height - hoffset_), + (cor[0] - hoffset_, max_height + hoffset_), + ], + fill=line_colour, + width=line_width, + ) + if ( + n + 1 < len(x_coordinates[idx]) + and (x_coordinates[idx][n + 1][1] + voffset // 2) + - (cor[1] + voffset // 2) + > 50 + ): + draw.line( + [ + (cor[1] + voffset // 2, min_height - hoffset_), + (cor[1] + voffset // 2, max_height + hoffset_), + ], + fill=line_colour, + width=line_width, + ) + elif n + 1 == len(x_coordinates[idx]): + draw.line( + [ + (cor[1] + voffset // 2, min_height - hoffset_), + (cor[1] + voffset // 2, max_height + hoffset_), + ], + fill=line_colour, + width=line_width, + ) + + +def draw_horizontal( + idx: int, + max_ycd: Dict[int, List[int]], + hoffset: Union[int, float], + hoffset_: Union[int, float], + min_x: int, + min_height: int, + max_x: int, + line_colour: str, + line_width: int, + draw: ImageDraw.ImageDraw, +) -> None: + """ + Draw horizontal lines on an image using the provided coordinates and parameters. + + Args: + idx (int): Index of the line to be drawn. + max_ycd (Dict[int, List[int]]): List of y-coordinates for the lines. + hoffset (Union[int, float]): Horizontal offset for the lines. + hoffset_ (Union[int, float]): Another horizontal offset for specific cases. + min_x (int): Minimum x-coordinate for the lines. + min_height (int): Minimum height for the lines. + max_x (int): Maximum x-coordinate for the lines. + line_colour (str): Color of the lines. + line_width (int): Width of the lines. + draw (ImageDraw.ImageDraw): ImageDraw object for drawing on an image. + + Returns: + None: The function draws horizontal lines on the image. 
+ """ + for n, y in enumerate(max_ycd[idx]): + if n == 0: # column header min y coord + draw.line( + ( + min_x - (1 * hoffset), + min_height - hoffset_, + max_x + (1.5 * hoffset), + min_height - hoffset_, + ), + fill=line_colour, + width=line_width, + ) + else: + draw.line( + ( + min_x - (2 * hoffset), + y + hoffset, + max_x + (1.5 * hoffset), + y + hoffset, + ), + fill=line_colour, + width=line_width, + ) + + +def enhance_and_save_pdfs( + output_bucket: str, + gcs_cde_hitl_output_prefix: str, + line_enhance_prefix: str, + cde_input_output_map: Dict[str, Dict[str, str]], + voffset_: int, + hoffset_: Union[int, float], + factor: float = 0.75, +): + """ + Enhance the table structure by drawing the lines based on CDE output, + headers and rows coordinates. + """ + + # Initialize Google Cloud Storage client + storage_client = storage.Client() + bucket = storage_client.bucket(output_bucket) + voffset, hoffset, line_width, line_colour = voffset_, hoffset_, 5, "black" + for file, data in cde_input_output_map.items(): + file_key = file[:-4] + if data.get("hitl", None): + operation = data["hitl"] + cde_jsons = read_json_output( + output_bucket=output_bucket, + output_prefix=f"{gcs_cde_hitl_output_prefix}/{operation}", + hitl=True, + ) + document = cde_jsons[operation] + # print("HITL") + else: + cde_jsons = read_json_output( + output_bucket=output_bucket, output_prefix=data["cde"] + ) + document = cde_jsons[file_key] + # print("NO HITL") + try: + images_for_pdf = [] + for idx, page in enumerate(document.pages): + x_coordinates, _ , _ , max_ycd = get_coordinates_map(document) + image_content = page.image.content + image = PilImage.open(BytesIO(image_content)) + draw = ImageDraw.Draw(image) + min_height, max_height = max_ycd[idx][0], max_ycd[idx][-1] + min_x, max_x = x_coordinates[idx][0][0], x_coordinates[idx][-1][1] + hoffset_ = factor * voffset + # Draw horizontal + if idx in max_ycd: + draw_horizontal(idx, max_ycd, hoffset, hoffset_, min_x, + min_height, max_x, line_colour, line_width, draw) + # for n, y in enumerate(max_ycd[idx]): + # if n == 0: # column header min y coord + # draw.line( + # ( + # min_x - (1 * hoffset), + # min_height - hoffset_, + # max_x + (1.5 * hoffset), + # min_height - hoffset_, + # ), + # fill=line_colour, + # width=line_width, + # ) + # else: + # draw.line( + # ( + # min_x - (2 * hoffset), + # y + hoffset, + # max_x + (1.5 * hoffset), + # y + hoffset, + # ), + # fill=line_colour, + # width=line_width, + # ) + # Drawing vertical lines + if idx in x_coordinates: + draw_vertical(idx, x_coordinates, hoffset_, min_height, + max_height, line_colour, line_width, voffset, draw) + # for n, cor in enumerate(x_coordinates[idx]): + # if n == 0: + # draw.line( + # [ + # (cor[0] - hoffset_, min_height - hoffset_), + # (cor[0] - hoffset_, max_height + hoffset_), + # ], + # fill=line_colour, + # width=line_width, + # ) + # if ( + # n + 1 < len(x_coordinates[idx]) + # and (x_coordinates[idx][n + 1][1] + voffset // 2) + # - (cor[1] + voffset // 2) + # > 50 + # ): + # draw.line( + # [ + # (cor[1] + voffset // 2, min_height - hoffset_), + # (cor[1] + voffset // 2, max_height + hoffset_), + # ], + # fill=line_colour, + # width=line_width, + # ) + # elif n + 1 == len(x_coordinates[idx]): + # draw.line( + # [ + # (cor[1] + voffset // 2, min_height - hoffset_), + # (cor[1] + voffset // 2, max_height + hoffset_), + # ], + # fill=line_colour, + # width=line_width, + # ) + # Append modified image to the list + images_for_pdf.append(image) + # Save images to a single PDF + pdf_stream = BytesIO() + 
images_for_pdf[0].save( + pdf_stream, + save_all=True, + append_images=images_for_pdf[1:], + resolution=100.0, + quality=95, + optimize=True, + format="PDF", + ) + # Upload PDF to Google Cloud Storage + blob = bucket.blob(f"{line_enhance_prefix}/{file_key}.pdf") + blob.upload_from_string( + pdf_stream.getvalue(), content_type="application/pdf" + ) + print(f"Done Processing -{file_key}.pdf") + except ValueError: + print(f"Issue with processing -{file_key}.pdf") + images_for_pdf = [] + for idx, page in enumerate(document.pages): + image_content = page.image.content + image = PilImage.open(BytesIO(image_content)) + draw = ImageDraw.Draw(image) + # Append original image to the list + images_for_pdf.append(image) + + # Save images to a single PDF + pdf_stream = BytesIO() + images_for_pdf[0].save( + pdf_stream, + save_all=True, + append_images=images_for_pdf[1:], + resolution=100.0, + quality=95, + optimize=True, + format="PDF", + ) + # Upload PDF to Google Cloud Storage + blob = bucket.blob(f"{line_enhance_prefix}/{file_key}.pdf") + blob.upload_from_string( + pdf_stream.getvalue(), content_type="application/pdf" + ) + print("Completed Preprocessing") diff --git a/incubator-tools/backmapping_entities_from_parser_output_to_original_language/README.md b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/README.md new file mode 100644 index 000000000..ddb8f821f --- /dev/null +++ b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/README.md @@ -0,0 +1,39 @@ +# Purpose and Description + +This document explains how to back-map entities from the output of a parser trained in a different language to the original language of the document, using the Google Cloud Translation API. + +## Workflow to BackMap the Entities to Original language + +workflow.png + + +## Input Details +* **PROJECT_ID**: GCP project ID +* **LOCATION**: Location of DocumentAI Processor, either `us` or `eu` +* **PROCESSOR_ID**: DocumentAI Parser ProcessorID +* **PROCESSOR_VERSION_ID**: DocumentAI Parser processor version id +* **ORIGINAL_SAMPLES_GCS_PATH**: GCS folder path containing native-language (non-English) documents +* **OUTPUT_BUCKET**: GCS output bucket name to store results (without gs://) +* **OUTPUT_GCS_DIR**: Output folder path to store results in the above-mentioned output bucket (without gs://) +* **MIME_TYPE**: Mimetype of input documents +* **TRANSLATION**: `True` if you need translation of documents from the non-English language to English, otherwise `False` +* **BACKMAPPING**: `True` if you need backmapping of entities from the parser output to the original (non-English) language, otherwise `False` +* **SAVE_TRANSLATED_PDF**: `True` if you need to store the translated documents returned by the Cloud Translation API +* **ORIGINAL_LANGUAGE**: Provide the language code of the original documents, e.g. '`el`' for Greek input files +* **TARGET_LANGUAGE**: Provide the target language code, e.g. '`en`' to translate to English +* **DIFF_X**: X-coordinate offset +* **DIFF_Y**: Y-coordinate offset + +## Output Details +1. Raw document sample (Greek PDF sample) + original_doc_greek.png
+ +2. After translation from Greek to English using the Cloud Translation API + after_translation_greek_to_eng.png + +3. After using the Translation API, every translated document contains `Machine Translated By Google` text at the top-left corner of the translated page + redact_noise_after_translation.png + +4. Sample CSV output file data for comparison between the original document entities' mention text and the translated document mention text + df_comparision_output.png + diff --git a/incubator-tools/backmapping_entities_from_parser_output_to_original_language/backmap_utils.py b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/backmap_utils.py new file mode 100644 index 000000000..9c5fd6d79 --- /dev/null +++ b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/backmap_utils.py @@ -0,0 +1,1255 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=R1702 +# pylint: disable=R0912 +# pylint: disable=R0913 +# pylint: disable=R0914 +# pylint: disable=R0915 +# pylint: disable=E0401 +# pylint: disable=C0302 +"""This module contains helper functions for the Backmapping Tool""" +import base64 +import io +import json +import os +import re +from collections import defaultdict +from typing import Any, Dict, List, MutableSequence, Optional, Tuple, Union +import cv2 +import google.auth.transport.requests +import numpy +import pandas as pd +import requests +from dateutil.parser import parse +from fuzzywuzzy import fuzz +from google import auth +from google.cloud import documentai_v1beta3 as documentai +from google.cloud import storage +from PIL import Image + + +def get_access_token() -> Union[str, Any]: + """ + Retrieves and returns an access token for API authentication. + Returns: + str: access token. + """ + credentials, _ = auth.default() + credentials.refresh(google.auth.transport.requests.Request()) + token = credentials.token + return token + + +def download_pdf(gcs_input_path: str, file_prefix: str) -> bytes: + """ + Reads the PDF file from Google Cloud Storage (GCS). + Args: + gcs_input_path (str): GCS document path. + file_prefix (str): Prefix for the GCS document name. + Returns: + pdf_content (bytes): bytes object containing the PDF content. + """ + client = storage.Client() + bucket_name = gcs_input_path.split("/")[2] + bucket = client.bucket(bucket_name) + blob = bucket.blob(file_prefix) + pdf_content = blob.download_as_bytes() + return pdf_content + + +def process_document( + project_id: str, + location: str, + processor_id: str, + processor_version: str, + file_content: Optional[bytes] = None, + file_uri: Optional[str] = None, + mime_type: str = "application/pdf", + is_native: Optional[bool] = False, + ocr: Optional[bool] = True, +) -> documentai.Document: + """ + This function processes a document using either PDF content in bytes or a GCS file URI. + Args: + project_id (str): The project ID for Google Cloud services. + location (str): The location of the Google Cloud project.
+ processor_id (str): The ID of the processor to use. + processor_version (str): The version of the processor to use. + file_content (bytes): PDF document content in bytes. + file_uri (str): GCS file path. + mime_type (str): Type of document, e.g., application/pdf. + is_native (bool): True, if the input PDF is native. + ocr (bool): True, if running OCR processor. + Returns: + documentai.Document: Parsed JSON data of the document as Document-object. + """ + opts = {"api_endpoint": f"{location}-documentai.googleapis.com"} + client = documentai.DocumentProcessorServiceClient(client_options=opts) + name = client.processor_version_path( + project_id, location, processor_id, processor_version + ) + if file_content: + document = documentai.RawDocument(content=file_content, mime_type=mime_type) + elif file_uri: + document = documentai.GcsDocument(gcs_uri=file_uri, mime_type=mime_type) + else: + raise ValueError("Either 'file_content' or 'file_uri' must be provided.") + process_options = None + if ocr: + ocr_config = documentai.OcrConfig( + enable_native_pdf_parsing=is_native, + enable_image_quality_scores=True, + enable_symbol=True, + ) + process_options = documentai.ProcessOptions(ocr_config=ocr_config) + raw_doc = document if file_content else None + gcs_doc = document if file_uri else None + request = documentai.ProcessRequest( + name=name, + raw_document=raw_doc, + gcs_document=gcs_doc, + skip_human_review=True, + process_options=process_options, + ) + result = client.process_document(request=request) + return result.document + + +def document_to_json(result: documentai.Document) -> Dict[str, Any]: + """ + Converts a Document AI process response to a JSON-friendly format. + Args: + result (documentai.Document): The result object from Document AI processing. + Returns: + Dict[str, Any]: A dictionary representing the JSON format of the processed document. + """ + # Convert the result to a JSON-friendly format + with io.BytesIO() as _: + a = io.BytesIO() + a.write(bytes(documentai.Document.to_json(result), "utf-8")) + a.seek(0) + json_string = a.read().decode("utf-8") + json_data = json.loads(json_string) + return json_data + + +def upload_to_cloud_storage( + filename: str, + data: Union[bytes, Dict[str, Any], pd.DataFrame], + output_bucket: str, + output_prefix: str, +) -> None: + """ + Uploads the document to GCS. + Args: + filename: File name. + data: Bytes, JSON data, or DataFrame to store as a file. + output_bucket: GCS Bucket name. + output_prefix: GCS prefix where to store the file. 
+ """ + storage_client = storage.Client() + bucket = storage_client.bucket(output_bucket) + gcs_uri = f"gs://{output_bucket}/{output_prefix}/{filename}" + blob = bucket.blob(f"{output_prefix}/{filename}") + if isinstance(data, bytes): + # Upload bytes as a file + blob.upload_from_string(data, content_type="application/pdf") + print(f"\tSaved the PDF document to GCS: {gcs_uri}") + elif isinstance(data, dict): + # Convert dictionary to JSON string and upload + json_string = json.dumps(data) + blob.upload_from_string(json_string, content_type="application/json") + print(f"\tSaved the JSON data to GCS: {gcs_uri}") + elif isinstance(data, pd.DataFrame): + # Convert DataFrame to CSV format and upload + csv_content = data.to_csv(index=False) + blob.upload_from_string(csv_content, content_type="text/csv") + print(f"\tSaved the DataFrame to GCS: {gcs_uri}") + else: + print("\tUnsupported data type for upload") + + +def get_redact_bbox_from_text( + text_redact: str, full_text: str, json_data: documentai.Document +) -> Dict[str, List[List[Any]]]: + """ + Extracts the bounding box from the given document for a specific text to redact. + Args: + text_redact (str): The text to redact. + full_text (str): The full text content of the document. + json_data (documentai.Document): The processed Document AI document. + Returns: + Dict[str, List[List[Any]]]: + A dictionary containing page numbers as keys and a list of + bounding boxes as values. + Bounding box format: [x_min, y_min, x_max, y_max]. + """ + part1 = re.escape(text_redact.split(" ")[0]) + part2 = re.escape(text_redact.split(" ")[-1]) + pattern = f"{part1}.*{part2}" + matches = re.finditer(pattern, full_text, flags=re.IGNORECASE) + + redact_bbox = {} + for match in matches: + start, end = match.span() + for page_num, page in enumerate(json_data.pages): + x, y = [], [] + for token in page.tokens: + if token.layout.text_anchor.text_segments: + si = token.layout.text_anchor.text_segments[0].start_index + ei = token.layout.text_anchor.text_segments[0].end_index + if si >= start and ei <= end: + norm_ver = token.layout.bounding_poly.normalized_vertices + x.extend([ver.x for ver in norm_ver]) + y.extend([ver.y for ver in norm_ver]) + + if x and y: # Check if x and y have been modified + bbox = [min(x), min(y), max(x), max(y)] + redact_bbox[str(page_num)] = [bbox] + + return redact_bbox + + +def get_synthesized_images(json_data: documentai.Document) -> List[Image.Image]: + """ + Convert JSON data into a list of images. + Args: + json_data (documentai.Document): Document AI JSON data. + Returns: + List[Image.Image]: List of synthesized images. + """ + synthesized_images = [] + + def decode_image(image_bytes: bytes) -> Image.Image: + with io.BytesIO(image_bytes) as image_file: + image = Image.open(image_file) + image.load() + return image + for page in json_data.pages: + synthesized_images.append(decode_image(page.image.content)) + return synthesized_images + + +def draw_black_box( + synthesized_images: List[Image.Image], + page_wise_bbox: Any , +) -> io.BytesIO: + """ + Draw black boxes around PII entities in synthesized images and add synthetic data. + Args: + synthesized_images (List[Image.Image]): List of synthesized images. + page_wise_bbox (Any): Dictionary with page-wise bounding boxes. + Returns: + io.BytesIO: Byte stream containing the final PDF with black boxes drawn. 
+ """ + open_cv_image = {} + for idx, _ in enumerate(synthesized_images): + open_cv_image[idx] = numpy.array(synthesized_images[idx].convert("RGB")) + img_final = [] + for i, image in open_cv_image.items(): + size = image.shape + for page, bbox_list in page_wise_bbox.items(): + if str(i) == page: + for bbox in bbox_list: + x1 = int(bbox[0] * size[1]) + y1 = int(bbox[1] * size[0]) + x2 = int(bbox[2] * size[1]) + y2 = int(bbox[3] * size[0]) + cv2.rectangle( + image, + (x1, y1), + (x2, y2), + (255, 255, 255), + thickness=cv2.FILLED, + ) + img_temp = Image.fromarray(image) + img_final.append(img_temp) + pdf_stream = io.BytesIO() + img_final[0].save( + pdf_stream, + save_all=True, + append_images=img_final[1:], + resolution=100.0, + quality=95, + optimize=True, + format="PDF", + ) + return pdf_stream + + +def redact( + project_id: str, + location: str, + processor_version: str, + processor_id: str, + pdf_bytes: bytes, + mime_type: str = "application/pdf", +) -> bytes: + """ + Main function to process documents, redact PII entities, and store the result. + Args: + project_id (str): GCP project id. + location (str): Location of the docai processor. + processor_version (str): DocAI processor version. + processor_id (str): DocAI processor id. + pdf_bytes (bytes): PDF document bytes to process. + mime_type (str, optional): Type of document, e.g., "application/pdf". + Defaults to "application/pdf". + Returns: + bytes: Redacted PDF document bytes. + """ + redact_text = ["Machine Translated by Google"] + json_data = process_document( + project_id, + location, + processor_id, + processor_version, + pdf_bytes, + None, + mime_type, + False, + False, + ) + redact_bbox: Dict[str, Any] = {} + try: + for t1 in redact_text: + page_wise_bbox_text = get_redact_bbox_from_text( + t1, json_data.text, json_data + ) + for p1, b1 in page_wise_bbox_text.items(): + if p1 in redact_bbox: + redact_bbox[p1].extend(b1) + else: + redact_bbox[p1] = b1 + except ValueError: + pass + synthesized_images = get_synthesized_images(json_data) + pdf_stream = draw_black_box(synthesized_images, redact_bbox) + redacted_pdf_stream = pdf_stream.getvalue() + return redacted_pdf_stream + + +def get_min_max_x_y( + bounding_box: MutableSequence[documentai.NormalizedVertex], +) -> Tuple[float, float, float, float]: + """ + Function returns min-max x & y coordinates from the entity bounding box. + Args: + bounding_box (MutableSequence[documentai.NormalizedVertex]): + A list of vertices representing the bounding bo + Returns: + min_max_x_y (Tuple[float, float, float, float]): + Minimum and maximum x,y coordinates of entity bounding box. 
+ """ + min_x = min(item.x for item in bounding_box) + min_y = min(item.y for item in bounding_box) + max_x = max(item.x for item in bounding_box) + max_y = max(item.y for item in bounding_box) + min_max_x_y = (min_x, max_x, min_y, max_y) + return min_max_x_y + + +def get_normalized_vertices( + coords: Dict[str, float] +) -> MutableSequence[documentai.NormalizedVertex]: + """ + It takes XY-Coordinates & creates normalized-verices for Document Object + Args: + coords (Dict[str, float]): It contains min&max xy-coordinates + Returns: + MutableSequence[documentai.NormalizedVertex]: + It returns list containing 4 NormalizedVertex Objects + """ + coords_order = [ + ("min_x", "min_y"), + ("max_x", "min_y"), + ("max_x", "max_y"), + ("min_x", "max_y"), + ] + nvs = [] + for x, y in coords_order: + nvs.append(documentai.NormalizedVertex(x=coords[x], y=coords[y])) + return nvs + + +def get_formatted_dates(main_string: str) -> Dict[str, str]: + """ + This function checks for dates in the given string and returns them in a specific format. + Args: + main_string (str): The input string containing dates. + Returns: + formatted_dates(Dict[str, str]): + A dictionary containing original dates as keys and formatted dates as values. + """ + # Regular expression pattern to find dates + date_pattern = ( + r"\b(?:\d{1,2}\/\d{1,2}\/\d{2,4}|\d{1,2}-\d{1,2}-\d{2,4}|" + r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2} \d{4}|" + r"(?:\d{1,2}\.\d{1,2}\.\d{2,4}|\d{1,2}\.\d{2}\.\d{2}))\b" + ) + # Find and format matching dates in the main string + dates = re.findall(date_pattern, main_string.replace("/n", " ")) + formatted_dates = {} + for date_str in dates: + try: + date_obj = parse(date_str, fuzzy=True) + formatted_date = date_obj.strftime("%d/%m/%Y") + formatted_dates[date_str] = formatted_date + except ValueError: + # Handle invalid date formats gracefully + pass + return formatted_dates + + +def find_matched_translation_pairs( + entity: documentai.Document.Entity, translation_api_output: List[Dict[str, str]] +) -> List[Dict[str, str]]: + """ + Function returns the best mapping text pairs with entity mention text. + Args: + entity (documentai.Document.Entity): Document AI extracted entity dictionary. + translation_api_output (List[Dict[str, str]])): + Mapping dictionary of source and corresponding translated text. + Returns: + best_match_pairs (List[Dict[str, str]]): + Pairs which are best matched with entity mention text. 
+ """ + pattern = r"^[0-9,.\n \\]*$" + regex = re.compile(pattern) + ent_mt = entity.mention_text + if regex.match(ent_mt): + best_match_pairs = [{"sourceText": ent_mt, "targetText": ent_mt}] + else: + def similar(a, b): + return fuzz.ratio(a, b) + target_lines = ent_mt.split("\n") + best_match_pairs = [] + for target_line in target_lines: + best_match_score = 0 + best_match_pair = None + for entry in translation_api_output: + target_text = entry["targetText"] + similarity_score = similar(target_line.strip(), target_text.strip()) + if similarity_score > best_match_score: + best_match_score = similarity_score + best_match_pair = entry + if best_match_pair: + best_match_pairs.append(best_match_pair) + return best_match_pairs + + +def get_page_text_anc_mentiontext( + entity: documentai.Document.Entity, + orig_invoice_json: documentai.Document, + min_max_x_y: Tuple[float, float, float, float], + mapping_text: str, + diff_y: float, + diff_x: float, + english_page_num: int, +) -> Tuple[ + Any, + Dict[str, Dict[Any, Any]], + str, + List[List[str]], + str, +]: + """ + Function returns the min-max coordinates, text anchors and mention text of backmapped entity + using provided extracted source entity, corresponding x&y coordinates, and + mapping text from translation api output, with coordinate offset check. + Args: + entity (documentai.Document.Entity): Document AI extracted entity dictionary. + orig_invoice_json (documentai.Document): Original document Invoice processor json output. + min_max_x_y (Tuple[float, float, float, float]): + Minimum and maximum x&y coordinates of entity bounding box. + mapping_text (str): Source text from translation text units. + diff_y (float): Y-coordinate offset. + diff_x (float): X-coordinate offset. + english_page_num (int): Document page number. + Returns: + Tuple[ + Any, + Dict[str, Dict[Any, Any]], + str, + List[List[str]], + str + ] + - bbox (Dict[str, float]): + Dictionary containing min-max x&y coordinates of the mapped entity. + - expected_text_anc (Dict[ + str, + str] + ]): + List of start and end indexes of the mapped entity. + - new_mention_text (str): Mapped entity text. + - match_string_pair (List[List[str]]): List of matched string pairs. + - method (str): Based on mapping block. + """ + min_x, _, min_y, _ = min_max_x_y + matches: List[Any] = [] + match_string_pair: List[Any] = [] + method = "" + # Track whether entity is matched from OCR or Translated units + orig_text = orig_invoice_json.text + try: + matches, match_string_pair = find_substring_indexes( + orig_text, entity.mention_text + ) + if matches: + method = "OCR-EntityMT" + except ValueError: + matches, match_string_pair = find_substring_indexes(orig_text, mapping_text) + if matches: + method = "OCR-TU" + if not matches: + matches, match_string_pair = find_substring_indexes(orig_text, mapping_text) + if matches: + method = "OCR-TU" + if not matches: + dates_german_text = get_formatted_dates(orig_text) + ent_date = get_formatted_dates(entity.mention_text) + matched_dates = defaultdict(list) + for k1, v1 in ent_date.items(): + for k2, v2 in dates_german_text.items(): + if v1 == v2: + matched_dates[k1].append(k2) + for _, mat_1 in matched_dates.items(): + for mat_11 in mat_1: + match_temp, match_string_pair_temp = find_substring_indexes( + orig_text, mat_11 + ) + for mat_2, str_pair in zip(match_temp, match_string_pair_temp): + matches.append(mat_2) + match_string_pair.append(str_pair) + if matches: + method = "OCR-EntityMT" + # Initialize variables. 
+ bbox, text_anc_1, new_mention_text, expected_text_anc = {}, {}, "", {} + # Iterate through match pairs. + for match, str_pair in zip(matches, match_string_pair): + try: + _ts = documentai.Document.TextAnchor.TextSegment( + start_index=int(match[0]), end_index=int(match[1]) + ) + bb, text_anc = get_token(orig_invoice_json, english_page_num, [_ts]) + except ValueError: + continue + # bb can have empty string return by get_token + if not bb: + continue + # Difference between the original and mapped bbox should be within defined offset. + cond1, cond2 = abs(bb["min_y"] - min_y) <= diff_y, abs(bb["min_x"] - min_x) <= diff_x + if cond1 and cond2: + diff_x = abs(bb["min_x"] - min_x) + diff_y = abs(bb["min_y"] - min_y) + bbox = bb + text_anc_1 = text_anc + for index, an3 in enumerate(text_anc_1): + si = an3.start_index + ei = an3.end_index + ent_text = orig_text[si:ei] + cond1 = index in [0, len(text_anc_1) - 1] + cond2 = ent_text.strip() in [")", "(", ":", " ", "/", "\\"] + if cond1 and cond2: + continue + new_mention_text += ent_text + expected_text_anc = {"textSegments": text_anc_1} + return bbox, expected_text_anc, new_mention_text, match_string_pair, method + + +def updated_entity_secondary( + orig_invoice_json: documentai.Document, + min_max_x_y: Tuple[float, float, float, float], + mapping_text: str, + english_page_num: int, +) -> Tuple[ + Any, + Any, + str, + List[List[Any]], + str, +]: + """ + Function returns the min-max coordinates, text anchors and mention text of backmapped entity + using provided extracted source entity x&y coordinates, and + mapping text from translation api output, with original document tokens. + Args: + orig_invoice_json (documentai.Document): Original document Invoice processor json output. + min_max_x_y (Tuple[float, float, float, float]): + Minimum and maximum x&y coordinates of entity bounding box. + mapping_text (str): Source text from translation text units. + english_page_num (int): Document page number. + Returns: + - Tuple[ + Any, + Any, + str, + List[List[Any]], + str + ]: + - updated_page_anc (Any): + Dictionary containing min-max x&y coordinates of the mapped entity. + - updated_text_anc (Any): + List of start and end indexes of the mapped entity.. + - mentiontext (str): Mapped entity text. + - match_string_pair (List[List[Any]]): List of matched string pairs. + - method (str): Based on mapping block. 
+ """ + min_x, max_x, min_y, max_y = min_max_x_y + text_anc_tokens = [] + confidence = [] + page_anc: Dict[str, List[float]] = {"x": [], "y": []} + mapping_list = mapping_text.split() + method = "OCR-TU" + match_string_pair = [] + for token in orig_invoice_json.pages[english_page_num].tokens: + norm_vert = token.layout.bounding_poly.normalized_vertices + new_min_x, new_max_x, new_min_y, new_max_y = get_min_max_x_y(norm_vert) + cond11 = abs(new_min_y - min_y) <= 0.01 + cond12 = abs(new_max_y - max_y) <= 0.01 + cond1 = cond11 and cond12 + cond21 = ( + new_min_x >= min_x and + new_min_y >= min_y and + new_max_x <= max_x and + new_max_y <= max_y + ) + if not (cond1 or cond21): + continue + text_anc_token = token.layout.text_anchor.text_segments + si, ei = text_anc_token[0].start_index, text_anc_token[0].end_index + orig_temp_text = orig_invoice_json.text[si:ei].strip().lower() + mapping_text_stripped = mapping_text.strip().lower() + if orig_temp_text in mapping_text_stripped: + for t3 in list(mapping_list): + ratio = fuzz.ratio(t3.lower(), orig_temp_text) + if ratio > 75: + mapping_list.remove(t3) + match_string_pair.append([t3, orig_temp_text]) + text_anc_tokens.extend(text_anc_token) + page_anc["x"].extend([new_min_x, new_max_x]) + page_anc["y"].extend([new_min_y, new_max_y]) + confidence.append(0.9) + sorted_temp_token = sorted(text_anc_tokens, key=lambda x: x.end_index) + temp_mention_text = "" + for index, seg in enumerate(sorted_temp_token): + si = seg.start_index + ei = seg.end_index + ent_text = orig_invoice_json.text[si:ei] + cond1 = index in [0, len(sorted_temp_token)] + cond2 = ent_text.strip() in [")", "(", ":", " ", "/", "\\"] + if cond1 and cond2: + continue + temp_mention_text += ent_text + try: + updated_page_anc = { + "min_x": min(page_anc["x"]), + "min_y": min(page_anc["y"]), + "max_x": max(page_anc["x"]), + "max_y": max(page_anc["y"]), + } + except ValueError: + # recheck the format + updated_page_anc = { + "min_x": min_x, + "min_y": min_y, + "max_x": max_x, + "max_y": max_y, + } + temp_mention_text = mapping_text + method = "OCR-TU-Direct" + # This sorted_temp_token(list(text_segment-object)) is list(text_segments[0]...(>1)) + updated_text_anc = {"textSegments": sorted_temp_token} + return ( + updated_page_anc, + updated_text_anc, + temp_mention_text, + match_string_pair, + method, + ) + + +def get_token( + json_dict: documentai.Document, + page: int, + text_anchors_check: MutableSequence[documentai.Document.TextAnchor.TextSegment], +) -> Tuple[ + Union[Dict[str, float], None], + Any, +]: + """ + This function takes a loaded JSON, page number, and text anchors as input + and returns the text anchors and page anchors. + Args: + json_dict (documentai.Document): The loaded JSON document. + page (int): The page number. + text_anchors_check (MutableSequence[documentai.Document.TextAnchor.TextSegment]): + List of text anchors to check. + Returns: + Tuple[ + Union[Dict[str, float], None], + Union[MutableSequence[documentai.Document.TextAnchor.TextSegment], None] + ] + - A tuple containing the final page anchors, text anchors, and confidence. 
+ """ + temp_text_anc = [] + temp_confidence = [] + temp_ver: Dict[str, List[float]] = {"x": [], "y": []} + ta_si = text_anchors_check[0].start_index + ta_ei = text_anchors_check[0].end_index + for token in json_dict.pages[page].tokens: + text_segs = token.layout.text_anchor.text_segments + si = text_segs[0].start_index + ei = text_segs[0].end_index + if text_segs == text_anchors_check: + text_temp = json_dict.text[si:ei] + cond2 = "\n" not in text_temp and len(text_temp) <= 2 + if len(text_temp) > 2 or cond2: + norm_verts = token.layout.bounding_poly.normalized_vertices + min_x, max_x, min_y, max_y = get_min_max_x_y(norm_verts) + temp_text_anc = text_segs + elif si >= ta_si - 2 and ei <= ta_ei + 2: + text_temp = json_dict.text[si:ei] + if len(text_temp) > 2 or "\n" not in text_temp: + norm_verts = token.layout.bounding_poly.normalized_vertices + min_x, max_x, min_y, max_y = get_min_max_x_y(norm_verts) + temp_ver["x"].extend([min_x, max_x]) + temp_ver["y"].extend([min_y, max_y]) + text_anc_token = text_segs + for an1 in text_anc_token: + temp_text_anc.append(an1) + temp_confidence.append(token.layout.confidence) + if not temp_text_anc: + for token in json_dict.pages[page].tokens: + text_segs = token.layout.text_anchor.text_segments + ts_si = text_segs[0].start_index + ts_ei = text_segs[0].end_index + if abs(ts_si - ta_si) <= 2: + text_temp = json_dict.text[ts_si:ts_ei] + if len(text_temp) > 2 or "\n" not in text_temp: + norm_verts = token.layout.bounding_poly.normalized_vertices + min_x, max_x, min_y, max_y = get_min_max_x_y(norm_verts) + temp_text_anc = text_segs + if temp_text_anc and temp_ver["x"]: + final_ver = { + "min_x": min(temp_ver["x"]), + "min_y": min(temp_ver["y"]), + "max_x": max(temp_ver["x"]), + "max_y": max(temp_ver["y"]), + } + final_text_anc = sorted(temp_text_anc, key=lambda x: x.end_index) + return final_ver, final_text_anc + # else: + return None, None + + +def get_updated_entity( + entity: documentai.Document.Entity, + orig_invoice_json: documentai.Document, + translation_api_output: List[Dict[str, str]], + english_page_num: int, + diff_y: float = 0.05, + diff_x: float = 0.3, +) -> Tuple[ + Dict[str, Any], + List[Any], + str, + List[List[str]], + str, + List[Dict[str, str]], +]: + """ + Function maps the entity from source to target and gives the back mapped entity. + Args: + entity (documentai.Document.Entity): Document AI extracted entity dictionary. + orig_invoice_json (documentai.Document): Original document Invoice processor json output. + translation_api_output (List[Dict[str, str]]): + Mapping dictionary of source and corresponding translated text. + english_page_num (int): Document page number. + diff_y (float): Y-coordinate offset. + diff_x (float): X-coordinate offset. + Returns: + Tuple[ + Dict[str, Any], + List[Any], + str, List[List[str]], str,List[Dict[str, str]] + ] + - main_page_anc (Dict[str, List[float]]): + Dictionary containing min-max x&y coordinates of the mapped entity. + - main_text_anc (MutableSequence[documentai.Document.TextAnchor.TextSegment]): + List of start and end indexes of the mapped entity. + - main_mentiontext (str): Mapped entity text. + - unique_list (List[List[str]]): Unique list of matched string pairs. + - method (str): Based on mapping block. + - mapping_text_list (List[Dict[str, str]]]): + List of matched translation units with entity text. 
+ """ + # Get matched translated text units + mapping_text_list = find_matched_translation_pairs(entity, translation_api_output) + main_mentiontext = "" + main_text_anc = [] + main_page_anc1: Dict[str, List[float]] = {"x": [], "y": []} + english_bb_area = entity.page_anchor.page_refs[0].bounding_poly.normalized_vertices + min_max_x_y = get_min_max_x_y(english_bb_area) + updated_page_anc: Dict[str, float] = {} + method = "" + mentiontext = "" + match_str_pair: List[Any] = [] + # Iterate over matched pairs {source: other lang, target: english}. + for map_text in mapping_text_list: + ( + updated_page_anc, + updated_text_anc, + mentiontext, + match_str_pair, + method, + ) = get_page_text_anc_mentiontext( + entity, + orig_invoice_json, + min_max_x_y, + map_text["sourceText"], + diff_y, + diff_x, + english_page_num, + ) + if len(updated_page_anc) == 0: + ( + updated_page_anc, + updated_text_anc, + mentiontext, + match_str_pair, + method, + ) = updated_entity_secondary( + orig_invoice_json, + min_max_x_y, + map_text["sourceText"], + english_page_num, + ) + if updated_page_anc: + main_page_anc1["x"].extend([updated_page_anc["min_x"], updated_page_anc["max_x"]]) + main_page_anc1["y"].extend([updated_page_anc["min_y"], updated_page_anc["max_y"]]) + for text_anc in updated_text_anc["textSegments"]: + if text_anc not in main_text_anc: + main_text_anc.append(text_anc) + main_text_anc = sorted(main_text_anc, key=lambda x: x.end_index) + if main_text_anc: + for index, text_an1 in enumerate(main_text_anc): + si = text_an1.start_index + ei = text_an1.end_index + ent_text = orig_invoice_json.text[si:ei] + # remove unwanted chars + cond1 = index in [0, len(main_text_anc) - 1] + punctuations = [")", "(", ":", " ", "/", "\\"] + if cond1 and ent_text.strip() in punctuations: + continue + main_mentiontext += ent_text + else: + main_mentiontext = mentiontext + unique_list = [] + for item in match_str_pair: + if item not in unique_list: + unique_list.append(item) + main_page_anc2 = { + "min_x": min(main_page_anc1["x"]), + "min_y": min(main_page_anc1["y"]), + "max_x": max(main_page_anc1["x"]), + "max_y": max(main_page_anc1["y"]), + } + text_anchor = documentai.Document.TextAnchor() + text_anchor.text_segments = main_text_anc + return ( + main_page_anc2, + text_anchor.text_segments, + main_mentiontext, + unique_list, + method, + mapping_text_list, + ) + + +def find_substring_indexes( + text: str, substring: str +) -> Tuple[List[Tuple[int, int]], List[List[str]]]: + """ + Function returns the start and end indexes of all the matches + between ocr text and mapping/mention text. + Args: + text (str): DocAI OCR text output. + substring (str): Translation API mapping text/DocAI entity mentioned text. + Returns: + Tuple[List[Tuple[int, int]], List[List[str]]] + - matches (List[Tuple[int, int]]): List of start and end indexes of all the matches. + - match_string_pair (List[List[str]]): List of pair of matched strings. 
+ """ + substring = substring.replace(",", ".") + list_str = substring.strip().split() + matches = [] + text_1 = text.replace(",", ".").lower() + match_string_pair = [] + if len(list_str) == 1: + list_str_1 = list_str[0].replace(",", ".") + pattern = re.compile(re.escape(list_str_1), re.IGNORECASE) + for match in pattern.finditer(text_1): + si = match.start() + ei = match.end() + ratio = fuzz.ratio(substring, text[si:ei]) + if ratio > 0.8: + matches.append((si, ei)) + match_string_pair.append([substring, text[si:ei]]) + else: + part = f"{re.escape(substring.strip())}" + pattern = re.compile(part, re.IGNORECASE) + # remove new line if present + text1 = text_1.replace("\n", " ").strip() + for match in pattern.finditer(text_1): + si = match.start() + ei = match.end() + delta = abs(ei - si) + cond1 = delta <= len(substring) + 15 + cond2 = delta >= abs(len(substring) * 0.75) + if cond1 and cond2: + ratio = fuzz.ratio(substring, text1[si:ei]) + if ratio > 0.8: + matches.append((si, ei)) + match_string_pair.append([substring, text1[si:ei]]) + return matches, match_string_pair + + +def translation_text_units( + project_id: str, + location: str, + processor_version: str, + processor_id: str, + target_language: str, + source_language: str, + input_uri: str, + output_gcs_bucket: str, + output_gcs_prefix: str, + save_translated_doc: Optional[bool] = False, + is_native: Optional[bool] = False, + remove_shadow: Optional[bool] = True, +) -> Tuple[bytes, List[Dict[str, str]], Dict[str, Any]]: + """ + Function to translate the document from source to target language. + Args: + project_id (str): GCP project id. + location (str): Location of the docai processor. + processor_version (str): DocAI processor version. + processor_id (str): DocAI processor id. + target_language (str): Language to which document is to be translated. + source_language (str): Document source language. + input_uri (str): GCS Document URI. + output_gcs_bucket (str): GCS bucket name. + output_gcs_prefix (str): GCS prefix where to store the file. + save_translated_doc (bool, optional): True, to save the translated doc. Defaults to False. + is_native (bool, optional): True, if input doc is native. Defaults to False. + remove_shadow (bool, optional): + True, to remove the shadow text from translated doc. Defaults to True. + Returns: + Tuple[bytes, List[Dict[str, str]], Dict[str, Any]]: A tuple containing + - pdf_bytes: Bytes of the translated document. + - doc_text_units: Mapping dictionary of source and corresponding translated text. + - json_response: Translated API json response. + """ + # Translation API. 
+ url = (f"https://translate.googleapis.com/v3/projects/{project_id}" + f"/locations/global:translateDocument") + headers = { + "content-type": "application/json", + "Authorization": f"Bearer {get_access_token()}", + } + # request-module is the only way and also users need to raise a request + # to enable this feature(output_text_unit) in their project + json_obj = { + "source_language_code": source_language, + "target_language_code": target_language, + "document_input_config": {"gcs_source": {"input_uri": input_uri}}, + "output_text_unit": "True", + "is_translate_native_pdf_only": is_native, + "enable_shadow_removal_native_pdf": remove_shadow, + } + x = requests.post(url, json=json_obj, headers=headers, timeout=300) + if x.status_code != 200: + print(f"\tstatus_code: {x.status_code}, reason: {x.reason}") + json_response = x.json() + doc_trans = json_response["documentTranslation"] + pdf_bytes = base64.b64decode(doc_trans["byteStreamOutputs"][0]) + redacted_pdf_bytes = redact( + project_id, location, processor_version, processor_id, pdf_bytes + ) + if save_translated_doc: + # save translated document + filename = input_uri.split("/")[-1] + upload_to_cloud_storage( + filename, + redacted_pdf_bytes, + output_gcs_bucket, + os.path.join(output_gcs_prefix, "translated_pdfs"), + ) + doc_text_units = doc_trans["textUnits"] + return redacted_pdf_bytes, doc_text_units, json_response + + +def run_consolidate( + english_invoice_doc: documentai.Document, + orig_invoice_doc: documentai.Document, + translation_api_output: List[Dict[str, str]], + diff_x: float, + diff_y: float, + lang: str, +) -> Tuple[pd.DataFrame, documentai.Document]: + """ + This function takes the source, target parsed jsons, and text units as input + and gives an updated json and comparison dataframe as output. + Args: + english_invoice_json (documentai.Document): + JSON output from DocAI Invoice parser when translated. + orig_invoice_json (documentai.Document): + JSON output from DocAI Invoice parser for the target language. + translation_api_output (List[Dict[str, str]]): Text units (sourceText, targetText pairs). + diff_y: Y-coordinate offset. + diff_x: X-coordinate offset. + lang: Original document language. + Returns: + Tuple[pd.DataFrame, documentai.Document] : + - df (pd.DataFrame): Comparison dataframe. + - orig_invoice_json (documentai.Document): Updated Original invoice JSON. 
+ """ + _updated_entities = [] + updated_text_anchor: List[Any] = [] + df = pd.DataFrame( + columns=[ + "English_entity_type", + "English_entity_MT", + "Original_entity_MT", + "match_pair", + "method", + "map_text_list", + "English_entity_bbox", + "Original_entity_bbox", + "Language", + ] + ) + for _entity in english_invoice_doc.entities: + # Error handling: no pageanchor for entity + if not _entity.page_anchor: + print("************") + continue + english_page_num = int(_entity.page_anchor.page_refs[0].page) + try: + if not _entity.properties: + ent_eng_type = _entity.type_ + ent_eng_mt = _entity.mention_text + pgrfs = _entity.page_anchor.page_refs[0] + bounding_box = pgrfs.bounding_poly.normalized_vertices + ent_eng_bbox = get_min_max_x_y(bounding_box) + ( + updated_page_anchor, + updated_text_anchor, + mentiontext, + match_str_pair, + method, + map_text_list, + ) = get_updated_entity( + _entity, + orig_invoice_doc, + translation_api_output, + english_page_num, + diff_y, + diff_x, + ) + nvs = get_normalized_vertices(updated_page_anchor) + _bounding_poly = documentai.BoundingPoly(normalized_vertices=nvs) + _entity.confidence = 1 + _entity.mention_text = mentiontext + page_refs = documentai.Document.PageAnchor.PageRef( + page=english_page_num, bounding_poly=_bounding_poly + ) + _entity.page_anchor = documentai.Document.PageAnchor( + page_refs=[page_refs] + ) + _ts = updated_text_anchor[0] + _entity.text_anchor.text_segments = [_ts] + _entity.text_anchor.content = mentiontext + if _entity.normalized_value: + del _entity.normalized_value + _updated_entities.append(_entity) + df.loc[len(df.index)] = [ + ent_eng_type, + ent_eng_mt.strip("\n"), + mentiontext.strip("\n"), + match_str_pair, + method, + map_text_list, + ent_eng_bbox, + [ + updated_page_anchor["min_x"], + updated_page_anchor["min_y"], + updated_page_anchor["max_x"], + updated_page_anchor["max_y"], + ], + lang, + ] + else: + _child_properties = [] + _parent_text_segments = [] + _parent_x_y: Dict[str, List[Any]] = {"x": [], "y": []} + for _child_ent in _entity.properties: + child_ent_eng_type = _child_ent.type_ + child_ent_eng_mt = _child_ent.mention_text + pgrfs = _child_ent.page_anchor.page_refs[0] + bounding_box_1 = pgrfs.bounding_poly.normalized_vertices + child_ent_eng_bbox = get_min_max_x_y(bounding_box_1) + try: + try: + ( + updated_page_anchor, + updated_text_anchor, + mentiontext, + match_str_pair, + method, + map_text_list, + ) = get_updated_entity( + _child_ent, + orig_invoice_doc, + translation_api_output, + english_page_num, + 0.01, + 0.1, + ) + except ValueError: + ( + updated_page_anchor, + updated_text_anchor, + mentiontext, + match_str_pair, + method, + map_text_list, + ) = get_updated_entity( + _child_ent, + orig_invoice_doc, + translation_api_output, + english_page_num, + diff_y, + diff_x, + ) + nvs = get_normalized_vertices(updated_page_anchor) + _bounding_poly = documentai.BoundingPoly( + normalized_vertices=nvs + ) + page_refs = documentai.Document.PageAnchor.PageRef( + page=english_page_num, bounding_poly=_bounding_poly + ) + _pa = documentai.Document.PageAnchor(page_refs=[page_refs]) + _child_property = documentai.Document.Entity( + confidence=1, + mention_text=mentiontext, + type_=_child_ent.type_, + page_anchor=_pa, + ) + _child_ent.text_anchor.content = mentiontext + if _child_ent.normalized_value: + del _child_ent.normalized_value + _child_properties.append(_child_property) + _updated_text_anchor = updated_text_anchor[0] + _parent_text_segments.extend([_updated_text_anchor]) + for norm_ver in 
_bounding_poly.normalized_vertices: + _parent_x_y["x"].append(norm_ver.x) + _parent_x_y["y"].append(norm_ver.y) + df.loc[len(df.index)] = [ + child_ent_eng_type, + child_ent_eng_mt.strip("\n"), + mentiontext.strip("\n"), + match_str_pair, + method, + map_text_list, + child_ent_eng_bbox, + [ + updated_page_anchor["min_x"], + updated_page_anchor["min_y"], + updated_page_anchor["max_x"], + updated_page_anchor["max_y"], + ], + lang, + ] + except ValueError: + df.loc[len(df.index)] = [ + child_ent_eng_type, + child_ent_eng_mt.strip("\n"), + "", + "", + "", + "", + "", + "", + lang, + ] + _sorted_parent_text_segments = sorted( + _parent_text_segments, key=lambda x: int(x.end_index) + ) + _parent_mention_text = "" + for ts in _sorted_parent_text_segments: + si = ts.start_index + ei = ts.end_index + temp = orig_invoice_doc.text[si:ei] + _parent_mention_text += f" {temp}" + parent_page_anchor = { + "min_x": min(_parent_x_y["x"]), + "min_y": min(_parent_x_y["y"]), + "max_x": max(_parent_x_y["x"]), + "max_y": max(_parent_x_y["y"]), + } + nvs = get_normalized_vertices(parent_page_anchor) + _parent_bounding_poly = documentai.BoundingPoly(normalized_vertices=nvs) + page_refs = documentai.Document.PageAnchor.PageRef( + page=english_page_num, bounding_poly=_parent_bounding_poly + ) + _pa = documentai.Document.PageAnchor(page_refs=[page_refs]) + _parent_entity = documentai.Document.Entity( + confidence=1, + id=_entity.id, + type_=_entity.type_, + mention_text=_parent_mention_text, + properties=_child_properties, + page_anchor=_pa, + ) + _ts = updated_text_anchor[0] + _parent_entity.text_anchor.text_segments = [_ts] + _updated_entities.append(_parent_entity) + except (IndexError, ValueError): + ent_t = _entity.type_ + ent_mt = _entity.mention_text + ent_eng_bbox12: Tuple[float, float, float, float] = (0.0, 0.0, 0.0, 0.0) + pgrfs = _entity.page_anchor.page_refs[0] + bounding_box = pgrfs.bounding_poly.normalized_vertices + if bounding_box: + ent_eng_bbox12 = get_min_max_x_y(bounding_box) + df.loc[len(df.index)] = [ + ent_t, + ent_mt.strip("\n"), + "", + "", + "", + "", + ent_eng_bbox12, + "", + lang, + ] + for index, entity in enumerate(_updated_entities): + entity.id = str(index) + if entity.properties: + for child_index, child_entity in enumerate(entity.properties): + child_entity.id = str(child_index) + orig_invoice_doc.entities = _updated_entities + return df, orig_invoice_doc diff --git a/incubator-tools/backmapping_entities_from_parser_output_to_original_language/backmapping_entities_from_parser_output_to_original_language.ipynb b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/backmapping_entities_from_parser_output_to_original_language.ipynb new file mode 100644 index 000000000..112da65a2 --- /dev/null +++ b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/backmapping_entities_from_parser_output_to_original_language.ipynb @@ -0,0 +1,355 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bb1a4e07-a39a-47a0-b663-c5a2a43c30f9", + "metadata": {}, + "source": [ + "# Backmapping Entities From Parser Output To Original Language" + ] + }, + { + "cell_type": "markdown", + "id": "5c3e8f40-015e-489d-8bed-0ee3633abb93", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "5b7d1192-139d-4cbf-96cc-e63d0a873ad9", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. 
It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "1ba5fab9-2787-4ced-8d80-67648f13e850", + "metadata": {}, + "source": [ + "## Objective\n", + "This document guides to backmap the entities from the parser output which is trained in different languages to the original language of the document using google translation API.\n" + ] + }, + { + "cell_type": "markdown", + "id": "5f911bba-77b0-4b86-81e0-364b0b55ef2d", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "* Vertex AI Notebook\n", + "* Documents in GCS folder to backmap\n", + "* Parser details\n", + "* `textUnits` option for **Cloud Translation API** needs to be allowlisted/enabled for project" + ] + }, + { + "cell_type": "markdown", + "id": "435dee2b-76d7-4c02-ad6e-7e3996008b90", + "metadata": {}, + "source": [ + "## Workflow to BackMap the Entities to Original language\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "952e913d-7e88-4cd3-beb2-33d196c11d3d", + "metadata": {}, + "source": [ + "## Step-by-Step Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "4a6ba8f2-de6b-4a6a-b960-4df92704a486", + "metadata": {}, + "source": [ + "## 1. Import Modules/Packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c02b7b6f-74c8-470c-8503-0005b281beb2", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install fuzzywuzzy -q\n", + "!pip install google-auth -q\n", + "!pip install google-cloud-documentai -q\n", + "!pip install google-cloud-storage -q\n", + "!pip install numpy -q\n", + "!pip install opencv-python -q\n", + "!pip install pandas -q\n", + "!pip install pillow -q\n", + "!pip install python-dateutil -q\n", + "!pip install requests -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7270f808-bbc3-44d6-855d-c7a916200049", + "metadata": {}, + "outputs": [], + "source": [ + "# Run this cell to download utilities module\n", + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b98ecf2-c6ec-45bf-a712-c87bce7c26c0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from backmap_utils import (\n", + " document_to_json,\n", + " download_pdf,\n", + " process_document,\n", + " run_consolidate,\n", + " translation_text_units,\n", + " upload_to_cloud_storage,\n", + ")\n", + "from utilities import file_names" + ] + }, + { + "cell_type": "markdown", + "id": "072bb930-6039-42d1-9087-3e45751ab346", + "metadata": {}, + "source": [ + "## 2. 
Input Details" + ] + }, + { + "cell_type": "markdown", + "id": "bad7e1b4-6de1-4fc7-aa4f-41ac2a02ecb4", + "metadata": {}, + "source": [ + "* **PROJECT_ID**: GCP project ID\n", + "* **LOCATION**: Location of DocumentAI Processor, either `us` or `eu`\n", + "* **PROCESSOR_ID**: DocumentAI Parser ProcessorID \n", + "* **PROCESSOR_VERSION_ID**: DocumentAI Parser processor version id\n", + "* **ORIGINAL_SAMPLES_GCS_PATH**: GCS folder apth containing native-language(non-english) documents\n", + "* **OUTPUT_BUCKET**: GCS output bucket-name to store results(with-out gs://)\n", + "* **OUTPUT_GCS_DIR**: Output folder path to store results in above mentioned output-bucket(with-out gs://)\n", + "* **MIME_TYPE**: Mimetype of input documents\n", + "* **TRANSLATION**: `True` if you needed translation of documents from non-eng to english language, otherwise `False`\n", + "* **BACKMAPPING**: `True` if you needed backamapping of entities from parser-output to original language(non-english), otherwise `False`\n", + "* **SAVE_TRANSLATED_PDF**: `True` if you need to store translated doc-results of Cloud Translation API output results\n", + "* **ORIGINAL_LANGUAGE**: Provide language code of original documents. eg:- '`de`' for greek input files\n", + "* **TARGET_LANGUAGE**: Provide target language code. eg:- '`en`' to sonvert to english\n", + "* **DIFF_X**: X-coordinate offset\n", + "* **DIFF_Y**: Y-coordinate offset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf4b3571-d01c-421b-8a0f-91dbb95d38e3", + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT_ID = \"xx-xx-xx\"\n", + "LOCATION = \"us\" # or \"eu\"\n", + "PROCESSOR_ID = \"xx-xx-xx\" # Invoice processor ID\n", + "PROCESSOR_VERSION_ID = \"pretrained-invoice-v1.3-2022-07-15\"\n", + "ORIGINAL_SAMPLES_GCS_PATH = \"gs://bucket/path_to/backmapping/original_samples\"\n", + "OUTPUT_BUCKET = \"bucket_name_only\" # without gs://\n", + "OUTPUT_GCS_DIR = \"directory_name\" # without gs://\n", + "MIME_TYPE = \"application/pdf\"\n", + "TRANSLATION = True\n", + "BACKMAPPING = True\n", + "SAVE_TRANSLATED_PDF = True\n", + "ORIGINAL_LANGUAGE = \"de\"\n", + "TARGET_LANGUAGE = \"en\"\n", + "DIFF_X = 0.3\n", + "DIFF_Y = 0.05" + ] + }, + { + "cell_type": "markdown", + "id": "9986285a-e9c5-40bc-8ce7-4f35b1d40a59", + "metadata": {}, + "source": [ + "## 3. 
Run Below Code-Cells" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b70f2ac3-6dea-4aea-8cbf-2df25259ff7a", + "metadata": {}, + "outputs": [], + "source": [ + "files_list, files_dict = file_names(ORIGINAL_SAMPLES_GCS_PATH)\n", + "input_bucket_name = ORIGINAL_SAMPLES_GCS_PATH.split(\"/\")[2]\n", + "OUTPUT_GCS_DIR = OUTPUT_GCS_DIR.strip(\"/\")\n", + "df_merge = pd.DataFrame()\n", + "print(\n", + " \"Backmapping DocumentAI Parser Output to it's Original Language Process Started...\"\n", + ")\n", + "path_text_units = f\"{OUTPUT_GCS_DIR}/text_units\"\n", + "path_after_translation = f\"{OUTPUT_GCS_DIR}/after_translation\"\n", + "path_after_backmapping = f\"{OUTPUT_GCS_DIR}/after_backmapping\"\n", + "PATH_CONSOLIDATED_CSV = OUTPUT_GCS_DIR\n", + "CONSOLIDATED_CSV = \"consolidated_csv_after_backamapping.csv\"\n", + "for fn, fp in files_dict.items():\n", + " print(f\"File: {fn}\")\n", + " gcs_input_path = f\"gs://{input_bucket_name}/{fp}\"\n", + " pdf_bytes_target = download_pdf(gcs_input_path, fp) # .getvalue()\n", + " # converting non-eng-doc(greek) pdf to docai-json result using invoice-v3\n", + " print(\"\\tDocumentAI process sync-started for raw-document\")\n", + " target_docai_result = process_document(\n", + " PROJECT_ID,\n", + " LOCATION,\n", + " PROCESSOR_ID,\n", + " PROCESSOR_VERSION_ID,\n", + " file_content=pdf_bytes_target,\n", + " mime_type=MIME_TYPE,\n", + " is_native=False,\n", + " ocr=False,\n", + " )\n", + " json_data_target = document_to_json(target_docai_result)\n", + " filename = fn.split(\".\")[0]\n", + " if TRANSLATION:\n", + " print(\"\\t\\tTranslation process started...\")\n", + " input_uri = f\"gs://{OUTPUT_BUCKET}/{fp}\"\n", + " pdf_bytes_source, text_units, json_response = translation_text_units(\n", + " PROJECT_ID,\n", + " LOCATION,\n", + " PROCESSOR_VERSION_ID,\n", + " PROCESSOR_ID,\n", + " TARGET_LANGUAGE,\n", + " ORIGINAL_LANGUAGE,\n", + " input_uri,\n", + " OUTPUT_BUCKET,\n", + " OUTPUT_GCS_DIR,\n", + " save_translated_doc=SAVE_TRANSLATED_PDF,\n", + " )\n", + " text_units_dict = {\"text_units\": text_units}\n", + " upload_to_cloud_storage(\n", + " filename, text_units_dict, OUTPUT_BUCKET, path_text_units\n", + " )\n", + " print(\"\\tDocumentAI process sync-started for translated-document(English)\")\n", + " source_docai_result = process_document(\n", + " PROJECT_ID,\n", + " LOCATION,\n", + " PROCESSOR_ID,\n", + " PROCESSOR_VERSION_ID,\n", + " file_content=pdf_bytes_source,\n", + " mime_type=MIME_TYPE,\n", + " is_native=False,\n", + " ocr=False,\n", + " )\n", + " json_data_source = document_to_json(source_docai_result)\n", + " upload_to_cloud_storage(\n", + " filename, json_data_source, OUTPUT_BUCKET, path_after_translation\n", + " )\n", + "\n", + " if TRANSLATION and BACKMAPPING:\n", + " # Consolidate the extracted and processed data\n", + " print(\"\\t\\tBackmapping process started...\")\n", + " df, target_json = run_consolidate(\n", + " source_docai_result,\n", + " target_docai_result,\n", + " text_units,\n", + " DIFF_X,\n", + " DIFF_Y,\n", + " ORIGINAL_LANGUAGE,\n", + " )\n", + " target_json = document_to_json(target_json)\n", + " upload_to_cloud_storage(\n", + " filename, target_json, OUTPUT_BUCKET, path_after_backmapping\n", + " )\n", + " df.insert(loc=0, column=\"File Name\", value=filename)\n", + " df_merge = pd.concat([df_merge, df])\n", + "\n", + "upload_to_cloud_storage(\n", + " CONSOLIDATED_CSV, df_merge, OUTPUT_BUCKET, PATH_CONSOLIDATED_CSV\n", + ")\n", + "print(\"Process Completed!!!\")" + ] + }, + { + "cell_type": "markdown", + 
"id": "efb135cc-22f3-4aa6-adc6-cec5841cb997", + "metadata": {}, + "source": [ + "## 4. Output Details" + ] + }, + { + "cell_type": "markdown", + "id": "a77dc16a-a62d-4292-8f31-acada6295d28", + "metadata": {}, + "source": [ + "1. Raw Document sample(Greek PDF sample) \n", + "
\n", + "\n", + "2. After Translation from Greek to English using Cloud Translation API\n", + " \\n\n", + "\n", + "3. After using Translation API, every translated document contains `Machine Translated By Google` text at top-left conrner of translated page\n", + " \n", + "\n", + "4. Sample CSV output file data for comparision between original document entities mention text and translated document mention text\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c43c9cc-55f0-46fb-acf6-45e75935cf1a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68c96be6-85fe-466c-8d62-6f7ea3c0d079", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/after_translation_greek_to_eng.png b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/after_translation_greek_to_eng.png new file mode 100644 index 000000000..a9c146cbe Binary files /dev/null and b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/after_translation_greek_to_eng.png differ diff --git a/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/df_comparision_output.png b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/df_comparision_output.png new file mode 100644 index 000000000..97e964484 Binary files /dev/null and b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/df_comparision_output.png differ diff --git a/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/original_doc_greek.png b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/original_doc_greek.png new file mode 100644 index 000000000..152a3d979 Binary files /dev/null and b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/original_doc_greek.png differ diff --git a/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/redact_noise_after_translation.png b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/redact_noise_after_translation.png new file mode 100644 index 000000000..46aa0a5cf Binary files /dev/null and b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/redact_noise_after_translation.png differ diff --git a/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/workflow.png b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/workflow.png new file mode 100644 index 000000000..a7854719e Binary files /dev/null and b/incubator-tools/backmapping_entities_from_parser_output_to_original_language/images/workflow.png differ diff --git 
a/incubator-tools/bank_statement_post_processing_tool/README.md b/incubator-tools/bank_statement_post_processing_tool/README.md new file mode 100644 index 000000000..9b96420ec --- /dev/null +++ b/incubator-tools/bank_statement_post_processing_tool/README.md @@ -0,0 +1,39 @@ +# Purpose and Description + +This tool is designed to take bank statements from Google Cloud Storage (GCS), parse them with a DocAI bank statement processor, post-process the parser response (post-processing as per the requirements in the project specs), and provide the output in JSON format. + +Below are the steps the tool will follow: + +1. Bank statements are parsed through the bank statement processor. +2. Post-processing the response from the bank statement processor and saving the result in JSON format in the bucket. +3. This script includes the option to parse checks for the top three banks, which requires training a CDE model. +4. Top three banks: Wells Fargo, Bank of America, Chase + +images2 + +## Input Details + +* **Project name**: Enter the Google Cloud project name. +* **Project_Id**: Enter the Google Cloud project ID. +* **Processor_Id**: Enter the bank statement processor ID. +* **gcs_input_dir**: Enter the path of the files which have to be parsed. +* **gcs_output_dir**: Enter the path where you want to save the output JSONs after the files are processed by the bank statement parser. +* **gcs_new_output_json_path**: Enter the path where the post-processed JSON files have to be saved. +* **checksFlag**: True. Checks flag; if True, the CDE model is used to parse the checks table. + +Set checksFlag to True if you need checks to be parsed through the CDE-trained parser; otherwise set it to False. + +Fill in the below details only if checksFlag is True; otherwise they are not needed. + +* **Processor_id_checks**: Enter the CDE-trained processor ID. +* **Processor_version_checks**: Enter the CDE-trained processor version. + +## Output Details + +The trained processor will detect the check entities, but it detects the whole row as a parent item (check_item) if there are multiple tables (horizontally stacked). + +image3 + +This issue is handled in the post-processing code; below is the output after post-processing. + +image8 diff --git a/incubator-tools/bank_statement_post_processing_tool/bank_statement_post_processing_tool.ipynb b/incubator-tools/bank_statement_post_processing_tool/bank_statement_post_processing_tool.ipynb new file mode 100644 index 000000000..3e5dc6b4e --- /dev/null +++ b/incubator-tools/bank_statement_post_processing_tool/bank_statement_post_processing_tool.ipynb @@ -0,0 +1,2841 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c335127a-4cf6-4c9a-ae49-b5daa2e8ea0b", + "metadata": {}, + "source": [ + "# Bank Statement Post-Processing Guide" + ] + }, + { + "cell_type": "markdown", + "id": "ed350af2-ecbd-4a50-9073-44b27f493c59", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "a7a8065c-5783-46b7-a41a-6f30e7625f53", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "68fca907-6254-4962-aac6-01de009c906c", + "metadata": {}, + "source": [ + "## Purpose and Description\n", + "

This tool is designed to take bank statements from Google Cloud Storage (GCS), parse them with a DocAI bank statement processor, post-process the parser response (post-processing as per the requirements in the project specs), and provide the output in JSON format.

\n", + "

\n", + "

Below are the steps the tool will follow:\n", + "
\n", + "
  1. Bank statements are parsed through the bank statement processor.\n", + "
  2. Post-processing the response from the bank statement processor and saving the result in JSON format in the bucket.\n", + "
  3. This script includes the option to parse checks for the top three banks, which requires training a CDE model.\n", + "
  4. Top three banks: Wells Fargo, Bank of America, Chase\n", + "
\n", + "

\n", + " \"\"\n", + "

Bank Statement Parser working chart 

\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "4f334e64-d249-4bd2-8699-4eec1be943a0", + "metadata": {}, + "source": [ + "# Installation Guide\n", + "This Bank Statement Parser is in a Python notebook script format which can be used in Vertex AI JupyterLab Environment . First, put the Bank Statement script in JupyterLab and then, put all the reference documents in a specific folder . Also, create or use an existing folder as an output folder. " + ] + }, + { + "cell_type": "markdown", + "id": "7821eb9d-cd7b-4db4-828c-f1d16fd32191", + "metadata": {}, + "source": [ + "# step 1 : Installing modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c238e8e-e9bd-4bb4-8eec-1fffd3203ddf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install deepparse\n", + "%pip install google.cloud\n", + "%pip install dataclasses\n", + "%pip install difflib\n", + "%pip install pandas\n", + "%pip install numpy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66ea65cf-cca5-4a62-85cf-4c324b52f53e", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "markdown", + "id": "41097325-6a38-4be2-925b-b09531de6706", + "metadata": { + "tags": [] + }, + "source": [ + "# Step by Step procedure\n", + "
NOTE: Stepwise status can be seen in “logging.txt”

\n", + "\n", + " \"\"\n", + "

Bank Statement Parser logging.txt

\n", + "

Step 1: Create a Bank Statement Parser from Processor Gallery (Workbench).


\n", + "

Step 1.2:

Set pretrained-bankstatement-v3.0-2022-05-16 as the default processor version.\n" + ] + }, + { + "cell_type": "markdown", + "id": "0f237c75-b139-4d4d-a47a-0017e9fed4b3", + "metadata": {}, + "source": [ + "
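Step 1.2 is done in the console; if you prefer to set the default version programmatically, a hedged sketch using the Document AI client is shown below. The project ID, "us" location and processor ID are placeholders, and the call returns a long-running operation.

```python
# Hedged sketch: set the default processor version via the API instead of the console.
# All identifiers below except the pretrained version name are placeholders.
from google.cloud import documentai_v1 as documentai

client = documentai.DocumentProcessorServiceClient()
processor_name = client.processor_path("my-project", "us", "my-processor-id")
version_name = client.processor_version_path(
    "my-project", "us", "my-processor-id", "pretrained-bankstatement-v3.0-2022-05-16"
)
operation = client.set_default_processor_version(
    request=documentai.SetDefaultProcessorVersionRequest(
        processor=processor_name, default_processor_version=version_name
    )
)
operation.result()  # blocks until the default version is switched
```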

Step 3: Input Parameters

  Fill in the project details and the GCS folder paths for the input and output files in the code cell below.

\n", + " \"\"\n", + "
    \n", + "
  • Project name: Enter the Google Cloud project name\n", + "
  • Project_Id: Enter the Google Cloud project ID\n", + "
  • Processor_Id: Enter the bank statement processor ID\n", + "
  • gcs_input_dir: Enter the GCS path of the files which have to be parsed\n", + "
  • gcs_output_dir: Enter the GCS path where you want to save the output JSONs after the files are processed by the bank statement parser\n", + "
  • gcs_new_output_json_path: Enter the GCS path where the post-processed JSON files have to be saved\n", + "
\n", + "

\n", + "

If checks have to be parsed, then fill in the below details of the CDE-trained processor.

\n", + "

\n", + "
    \n", + "
  • checksFlag: True\n", + "
\n", + "

Set checksFlag to True if you need checks to be parsed through the CDE-trained parser; otherwise set it to False.

\n", + "

Fill in the below details only if checksFlag is True; otherwise they are not needed.

\n", + "

\n", + "
    \n", + "
  • Processor_id_checks: Enter the CDE-trained processor ID\n", + "
  • Processor_version_checks: Enter the CDE-trained processor version ID\n", + "
\n", + "
\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c1b8a545-e631-45af-a8db-738fe2d9fc6d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# input details\n", + "project_name = \"xxxxxxx\" # project name\n", + "project_id = \"xxxxxxxx\" # project number\n", + "processor_id = \"xxxxxxxxxx\" # processor id\n", + "gcs_input_dir = \"gs://xxxxxxx/xxxxxxx/xxxxxxxx/input_pdfs\" # input documents path\n", + "gcs_output_dir = \"gs://xxxxxxx/xxxxxxx/xxxxxxxxxx/processor_output\" # output documents for async parsing, suggested to use a diff bucket than ‘gcs_input_dir’\n", + "gcs_new_output_json_path = \"gs://xxxxxx/xxxxxx/xxxxxx/pp_output/\" # post process json path, , suggested to use a diff bucket than ‘gcs_input_dir’\n", + "### To Parse Checks Table Items, Please Train a CDE Model , and provide CDE processorID and processorVersionID below, and set checksFlag=True\n", + "checksFlag = (\n", + " True # Checks Flag, if True, It will use CDE Model to parse the checks table\n", + ")\n", + "processor_id_checks = \"xxxxxxx\" # CDE processor_id\n", + "processor_version_checks = \"xxxxxx\" # CDE processor_version_id" + ] + }, + { + "cell_type": "markdown", + "id": "6a4b6fea-1d55-450e-ac3d-714dba7285d9", + "metadata": {}, + "source": [ + "### **Step 4 :**\n", + "\n", + "Processing and Post processing the documents\\[run these cells without editing anything\\]\n", + "\n", + "\n", + "\n", + "If consolidated CSV is needed please uncomment the below area for CSV generation.\n", + "\n", + "\n", + "\n", + "Cheques Entity Detection:(currently considered only for top 3 banks)\n", + "\n", + "1. Trained a CDE model for cheque entity detection , which further needs to be post processed and combined with the bank statement parser post processed json file.\n", + "\n", + "2. We has to train a CDE model as below: \\[Refer [DocAI Workbench CDE Guide](https://www.google.com/url?q=https://cloud.google.com/document-ai/docs/workbench/build-custom-processor&sa=D&source=editors&ust=1704297767379409&usg=AOvVaw02t9j3qCz7KaDInejsTV3s) to setup a CDE processor\\]\n", + "\n", + " * Create a CDE parser with the below schema\n", + " * Select and Label documents which have Check details. Use the below convention and train the processor using those documents. Example Training instructions (illustration) given below.\n", + "\n", + "Incubator Implementation Notes:\n", + "\n", + "* For Check Description (check\\_desc), Incubator Team didn’t have any sample, so disabled that entity while training the CDE model.\n", + "* For the POC, 45 Training and 15 Test Documents were used.\n", + "* Getting ~90 percent accuracy for the top 3 banks.\n", + "\n", + "**Schema**:\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "

#

\n", + "
\n", + "

Description of the item

\n", + "
\n", + "

Entity name

\n", + "
\n", + "

Occurrence Type

\n", + "
\n", + "

1.

\n", + "
\n", + "

Total line item (includes Check Number, check date and check amount, check description)  [Parent]

\n", + "
\n", + "

check_item

\n", + "
\n", + "

Optional multiple

\n", + "
\n", + "

2.

\n", + "
\n", + "

Check number [child]

\n", + "
\n", + "

check_number

\n", + "
\n", + "

Optional once

\n", + "
\n", + "

3.

\n", + "
\n", + "

Check date [child]

\n", + "
\n", + "

check_date

\n", + "
\n", + "

Optional once

\n", + "
\n", + "

4

\n", + "
\n", + "

Check amount [child]

\n", + "
\n", + "

check_amount

\n", + "
\n", + "

Optional once

\n", + "
\n", + "

5.

\n", + "
\n", + "

Check Description [child]

\n", + "
\n", + "

check_desc

\n", + "
\n", + "

Optional once

\n", + "
\n", + "\n", + "* *Sample Schema* \n", + "
\n", + " \n", + "\n", + "\n", + "* Blue boxes are the labeled bounding boxes \n", + "
\n", + "\n", + "\n", + "\n", + "* The Trained processor will detect the check entities but with the characteristic of detecting the whole row as a parent item(check\\_item) , if there are multiple tables (horizontally stacked). \n", + "
\n", + "\n", + "\n", + "* The above issue is taken care of in the post processing code and below is the output after post processing code. \n", + "
\n", + "\n", + "\n", + "3. The Post processing code for modifying the CDE output and combining with the Bank statement parser post processing json is given below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db54aeac-5f62-4387-ac16-06d1f5a75295", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import storage\n", + "import json\n", + "import re\n", + "import pandas as pd\n", + "import copy\n", + "import os\n", + "import random\n", + "import string\n", + "from dataclasses import dataclass\n", + "from difflib import SequenceMatcher\n", + "import numpy as np\n", + "from io import BytesIO\n", + "from deepparse.parser import AddressParser\n", + "from datetime import datetime\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "from typing import Any, Dict, List, Optional, Sequence, Tuple, Union\n", + "from utilities import (\n", + " file_names,\n", + " documentai_json_proto_downloader,\n", + " copy_blob,\n", + " process_document_sample,\n", + " store_document_as_json,\n", + " batch_process_documents_sample,\n", + " blob_downloader,\n", + " create_pdf_bytes_from_json,\n", + ")\n", + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "\n", + "# finds maximum id\n", + "def maxIdFinder(jsonData: documentai.Document) -> int:\n", + " \"\"\"\n", + " Function to get the maximum id from the entity id attribute..\n", + "\n", + " Parameters\n", + " ----------\n", + " jsonData : documentai.Document\n", + " The document proto having all the entities with the id attribute.\n", + " Returns\n", + " -------\n", + " int\n", + " Returns the maximum id.\n", + " \"\"\"\n", + " global maxId\n", + "\n", + " allEntities = jsonData.entities\n", + " noOfEntitiesInJsonFile = len(allEntities)\n", + " jsonDict = {\n", + " \"confidence\": [],\n", + " \"id\": [],\n", + " \"mention_text\": [],\n", + " \"normalized_value\": [],\n", + " \"page_anchor\": [],\n", + " \"text_anchor\": [],\n", + " \"type\": [],\n", + " }\n", + " entitiesArray = []\n", + "\n", + " for i in range(0, noOfEntitiesInJsonFile):\n", + " try:\n", + " if allEntities[i].id:\n", + " entitiesArray.append(allEntities[i])\n", + " except:\n", + " for j in allEntities[i].properties:\n", + " entitiesArray.append(j)\n", + "\n", + " entitiesArray = sorted(entitiesArray, key=lambda x: x.id)\n", + "\n", + " for i in range(0, len(entitiesArray)):\n", + " try:\n", + " jsonDict[\"confidence\"].append(entitiesArray[i].confidence)\n", + " except:\n", + " jsonDict[\"confidence\"] = 0\n", + " try:\n", + " jsonDict[\"id\"].append(entitiesArray[i].id)\n", + " except:\n", + " jsonDict[\"id\"].append(\"\")\n", + " try:\n", + " jsonDict[\"mention_text\"].append(entitiesArray[i].mention_text)\n", + " except:\n", + " jsonDict[\"mention_text\"].append(\"\")\n", + " try:\n", + " jsonDict[\"normalized_value\"].append(entitiesArray[i].normalized_value)\n", + " except:\n", + " jsonDict[\"normalized_value\"].append(\"\")\n", + " try:\n", + " jsonDict[\"page_anchor\"].append(entitiesArray[i].page_anchor)\n", + " except:\n", + " jsonDict[\"page_anchor\"].append(\"\")\n", + " try:\n", + " jsonDict[\"text_anchor\"].append(entitiesArray[i].text_anchor)\n", + " except:\n", + " jsonDict[\"text_anchor\"].append(\"\")\n", + " try:\n", + " jsonDict[\"type\"].append(entitiesArray[i].type)\n", + " except:\n", + " jsonDict[\"type\"].append(\"\")\n", + "\n", + " tempList = []\n", + " for i in jsonDict[\"id\"]:\n", + " tempList.append(int(i))\n", + " maxId = max(tempList)\n", + " return maxId\n", + "\n", + 
"\n", + "def delete_folder(bucket_name: str, folder_name: str) -> None:\n", + " \"\"\"\n", + " Function to delete the folder in a given bucket.\n", + "\n", + " Parameters\n", + " ----------\n", + " bucket_name : str\n", + " The bucket name where all the folder are stored.\n", + " folder_name : str\n", + " The folder name which needs to be removed.\n", + " \"\"\"\n", + "\n", + " storage_client = storage.Client()\n", + " bucket = storage_client.get_bucket(bucket_name)\n", + " \"\"\"Delete object under folder\"\"\"\n", + " blobs = list(bucket.list_blobs(prefix=folder_name))\n", + " bucket.delete_blobs(blobs)\n", + " print(f\"Folder {folder_name} deleted.\")\n", + "\n", + "\n", + "def get_files_not_parsed(gcs_input_dir: str, gcs_output_dir: str) -> Tuple:\n", + " \"\"\"\n", + " Function to get the file which are not processed by the processor.\n", + "\n", + " Parameters\n", + " ----------\n", + " gcs_input_dir : str\n", + " The gcs path where the original documents(PDFs) are stored.\n", + " gcs_output_dir : str\n", + " The gcp path to store the output from the processor.\n", + " Returns\n", + " -------\n", + " Tuple\n", + " Returns the Tuple with values of temporary folder name, temporary bucket name and temporary initial path\n", + " \"\"\"\n", + " now = datetime.now()\n", + " OutputDirPrefix = now.strftime(\"%H%M%S%d%m%Y\")\n", + " pdfs_names_list, pdfs_names_dict_1 = file_names(gcs_input_dir)\n", + " Jsons_names_list, Jsons_names_dict_1 = file_names(gcs_output_dir)\n", + " file_name_dict = {a.split(\".\")[0]: a for a in pdfs_names_list}\n", + " json_name_dict = {a.split(\".\")[0]: a for a in Jsons_names_list}\n", + " files_list = list(file_name_dict.keys())\n", + " list_json_name_dict = list(json_name_dict.keys())\n", + " dict_json = {}\n", + " for i in range(len(list_json_name_dict)):\n", + " if list_json_name_dict[i].endswith(\"-0\"):\n", + " dict_json[(list_json_name_dict[i][:-2])] = list_json_name_dict[i]\n", + " else:\n", + " dict_json[(list_json_name_dict[i])] = list_json_name_dict[i]\n", + " temp_bucket = gcs_input_dir.split(\"/\")[2]\n", + " storage_client = storage.Client()\n", + " source_bucket = storage_client.get_bucket(temp_bucket)\n", + " list_new = []\n", + " for i in range(len(files_list)):\n", + " if files_list[i] in dict_json.keys():\n", + " print(\n", + " \" Processed json file already exists for:{} \".format(\n", + " file_name_dict[files_list[i]]\n", + " )\n", + " )\n", + " else:\n", + " list_new.append(files_list[i])\n", + " source_blob = source_bucket.blob(file_name_dict[files_list[i]])\n", + " # print(file_name_dict[files_list[i]])\n", + " temp = f\"{file_name_dict[files_list[i]]}\"\n", + " file_name_temp = pdfs_names_dict_1[temp]\n", + " prefix = (\n", + " gcs_input_dir.split(\"/\")[-1]\n", + " + \"/\"\n", + " + \"temp_\"\n", + " + f\"{OutputDirPrefix}\"\n", + " + \"/\"\n", + " + temp\n", + " )\n", + " # new_blob = source_bucket.copy_blob(source_blob, destination_bucket, filename[i])\n", + " copy_blob(temp_bucket, file_name_temp, temp_bucket, prefix)\n", + " temp_initial_path = (\n", + " \"gs://\"\n", + " + temp_bucket\n", + " + \"/\"\n", + " + gcs_input_dir.split(\"/\")[-1]\n", + " + \"/\"\n", + " + \"temp_\"\n", + " + f\"{OutputDirPrefix}\"\n", + " )\n", + " temp_folder = (\n", + " (\"/\").join(gcs_input_dir.split(\"/\")[3:])\n", + " + \"/\"\n", + " + \"temp_\"\n", + " + f\"{OutputDirPrefix}\"\n", + " + \"/\"\n", + " )\n", + " return temp_initial_path, temp_folder, temp_bucket\n", + "\n", + "\n", + "def get_files_not_postparsed(\n", + " gcs_output_dir: str, 
gcs_new_output_json_path: str\n", + ") -> Tuple:\n", + " \"\"\"\n", + " Function to get the file which are not processed by the script.\n", + "\n", + " Parameters\n", + " ----------\n", + " gcs_output_dir : str\n", + " The gcs path where the processed documents are available which are already been parsed by processor.\n", + " gcs_new_output_json_path : str\n", + " The gcp path to store the output from this script.\n", + " Returns\n", + " -------\n", + " Tuple\n", + " Returns the Tuple with values of temporary folder name, temporary bucket name and temporary initial path\n", + " \"\"\"\n", + " now = datetime.now()\n", + " OutputDirPrefix = now.strftime(\"%H%M%S%d%m%Y\")\n", + " pdfs_names_list, pdfs_names_dict_1 = file_names(gcs_output_dir)\n", + " Jsons_names_list, Jsons_names_dict_1 = file_names(gcs_new_output_json_path)\n", + " file_name_dict = {a.split(\".\")[0]: a for a in pdfs_names_list}\n", + " json_name_dict = {a.split(\".\")[0]: a for a in Jsons_names_list}\n", + " files_list = list(file_name_dict.keys())\n", + " temp_bucket = gcs_output_dir.split(\"/\")[2]\n", + " storage_client = storage.Client()\n", + " source_bucket = storage_client.get_bucket(temp_bucket)\n", + " list_new = []\n", + " for i in range(len(files_list)):\n", + " if files_list[i] in json_name_dict.keys():\n", + " print(\n", + " \" Processed json file already exists for:{} \".format(\n", + " file_name_dict[files_list[i]]\n", + " )\n", + " )\n", + " else:\n", + " list_new.append(files_list[i])\n", + " source_blob = source_bucket.blob(file_name_dict[files_list[i]])\n", + " # print(file_name_dict[files_list[i]])\n", + " temp = f\"{file_name_dict[files_list[i]]}\"\n", + " file_name_temp = pdfs_names_dict_1[temp]\n", + " prefix = (\n", + " gcs_output_dir.split(\"/\")[-1]\n", + " + \"/\"\n", + " + \"temp_\"\n", + " + f\"{OutputDirPrefix}\"\n", + " + \"/\"\n", + " + temp\n", + " )\n", + " # new_blob = source_bucket.copy_blob(source_blob, destination_bucket, filename[i])\n", + " copy_blob(temp_bucket, file_name_temp, temp_bucket, prefix)\n", + " temp_initial_path = (\n", + " \"gs://\"\n", + " + temp_bucket\n", + " + \"/\"\n", + " + gcs_output_dir.split(\"/\")[-1]\n", + " + \"/\"\n", + " + \"temp_\"\n", + " + f\"{OutputDirPrefix}\"\n", + " )\n", + " temp_folder = (\n", + " (\"/\").join(gcs_output_dir.split(\"/\")[3:])\n", + " + \"/\"\n", + " + \"temp_\"\n", + " + f\"{OutputDirPrefix}\"\n", + " + \"/\"\n", + " )\n", + " return temp_initial_path, temp_folder, temp_bucket\n", + "\n", + "\n", + "# dictionary for entity renaming\n", + "dict_ent_rename = {\n", + " \"statement_start_date\": \"Statement_Start_Date\",\n", + " \"statement_end_date\": \"Statement_End_Date\",\n", + " \"bank_name\": \"Financial_Institution\",\n", + "}\n", + "\n", + "\n", + "def accounttype_change(json_data: documentai.Document) -> documentai.Document:\n", + " \"\"\"\n", + " Function is for comparing sequences for the account entity (account_type, account number) and update the entity name.\n", + "\n", + " Parameters\n", + " ----------\n", + " document :documentai.Document\n", + " The document proto having all the entities\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto .\n", + " \"\"\"\n", + " import difflib\n", + " from difflib import SequenceMatcher\n", + "\n", + " accountnodict = {}\n", + " accountnamedict = {}\n", + "\n", + " def detials_account(account_type):\n", + " account_dict_lst = []\n", + " for i in range(len(json_data.entities)):\n", + " if not hasattr(json_data.entities[i], 
\"properites\"):\n", + " if (\n", + " difflib.SequenceMatcher(\n", + " None, json_data.entities[i].type, account_type\n", + " ).ratio()\n", + " >= 0.9\n", + " ):\n", + " try:\n", + " id1 = json_data.entities[i].id\n", + " except:\n", + " id1 = \"\"\n", + " try:\n", + " page1 = json_data.entities[i].page_anchor.page_refs[0].page\n", + " except:\n", + " page1 = 0\n", + " try:\n", + " textSegments1 = json_data.entities[i].text_anchor.text_segments[\n", + " 0\n", + " ]\n", + " except:\n", + " textSegments1 = \"\"\n", + " try:\n", + " temp_y_list = []\n", + " temp_x_list = []\n", + " for j in (\n", + " json_data.entities[i]\n", + " .page_anchor.page_refs[0]\n", + " .bounding_poly.normalized_vertices\n", + " ):\n", + " temp_y_list.append(float(j.y))\n", + " for j in (\n", + " json_data.entities[i]\n", + " .page_anchor.page_refs[0]\n", + " .bounding_poly.normalized_vertices\n", + " ):\n", + " temp_x_list.append(float(j.x))\n", + " x_max1 = max(temp_x_list)\n", + " y_max1 = max(temp_y_list)\n", + "\n", + " except:\n", + " y_max1 = \"\"\n", + " x_max1 = \"\"\n", + " account_dict_lst.append(\n", + " {\n", + " json_data.entities[i].mention_text: {\n", + " \"id\": id1,\n", + " \"page\": page1,\n", + " \"text_segments\": textSegments1,\n", + " \"x_max\": x_max1,\n", + " \"y_max\": y_max1,\n", + " }\n", + " }\n", + " )\n", + "\n", + " return account_dict_lst\n", + "\n", + " accountnamedict = detials_account(\"account_type\")\n", + " accountnodict = detials_account(\"account_i_number\")\n", + " accountnamedict\n", + " temp_del = []\n", + " for i in range(len(accountnamedict)):\n", + " for k in accountnamedict[i]:\n", + " if re.search(\"\\sstatement\", k, re.IGNORECASE):\n", + " temp_del.append(k)\n", + " for i in range(len(accountnamedict)):\n", + " try:\n", + " for k in accountnamedict[i]:\n", + " for m in temp_del:\n", + " if k == m:\n", + " del accountnamedict[i]\n", + " except:\n", + " pass\n", + " account_comp = []\n", + " for i in range(len(accountnamedict)):\n", + " for j in range(len(accountnodict)):\n", + " for k in accountnamedict[i]:\n", + " for m in accountnodict[j]:\n", + " y_diff = abs(\n", + " accountnamedict[i][k][\"y_max\"] - accountnodict[j][m][\"y_max\"]\n", + " )\n", + " account_comp.append({k: {m: y_diff}})\n", + " final_account_match = {}\n", + " for i in range(len(account_comp)):\n", + " for j in account_comp[i]:\n", + " for k in account_comp[i][j]:\n", + " if j.lower() not in final_account_match.keys():\n", + " final_account_match[j.lower()] = {k: account_comp[i][j][k]}\n", + " else:\n", + " for m in final_account_match:\n", + " for n in final_account_match[m]:\n", + " if j.lower() == m.lower():\n", + " if account_comp[i][j][k] < final_account_match[m][n]:\n", + " final_account_match[j.lower()] = {\n", + " k: account_comp[i][j][k]\n", + " }\n", + " else:\n", + " final_account_match[j.lower()] = {\n", + " n: final_account_match[m][n]\n", + " }\n", + "\n", + " for i in json_data.entities:\n", + " if not hasattr(i, \"properites\"):\n", + " for j in final_account_match:\n", + " for k in final_account_match[j]:\n", + " if i.mention_text:\n", + " if i.mention_text.lower() == j.lower():\n", + " for m in json_data.entities:\n", + " if m.mention_text:\n", + " if m.mention_text.lower() == k.lower():\n", + " i.type = (\"_\").join(\n", + " m.type.split(\"_\")[:2]\n", + " ) + \"_name\"\n", + "\n", + " account_names = {}\n", + " for i in range(len(json_data.entities)):\n", + " if not hasattr(json_data.entities[i], \"properites\"):\n", + " if (\n", + " difflib.SequenceMatcher(\n", + " None, 
json_data.entities[i].type, \"account_name\"\n", + " ).ratio()\n", + " >= 0.9\n", + " ):\n", + " account_names[json_data.entities[i].mention_text] = {\n", + " \"id\": json_data.entities[i].id,\n", + " \"type\": json_data.entities[i].type,\n", + " }\n", + "\n", + " for i in range(len(json_data.entities)):\n", + " if json_data.entities[i].type == \"account_type\":\n", + " for j in list(account_names.keys()):\n", + " if (\n", + " difflib.SequenceMatcher(\n", + " None, (json_data.entities[i].mention_text).lower(), j.lower()\n", + " ).ratio()\n", + " ) > 0.9:\n", + " json_data.entities[i].type = account_names[j].type\n", + " for i in range(len(json_data.entities)):\n", + " try:\n", + " while json_data.entities[i].type == \"account_type\":\n", + " del json_data.entities[i]\n", + " except Exception as e:\n", + " pass\n", + " return json_data\n", + "\n", + "\n", + "# logging function\n", + "\n", + "\n", + "def logger(filename: str, message: str) -> None:\n", + " \"\"\"\n", + " Function to write the message (error message, warning messgae, info message) to the logging text file.\n", + "\n", + " Parameters\n", + " ----------\n", + " filename : str\n", + " The text file name where the message needs to be written.\n", + " message : str\n", + " The string message from functions(error message, warning messgae, info message).\n", + "\n", + " \"\"\"\n", + " f = open(filename, \"a\")\n", + " f.write(\"{0} -- {1}\\n\".format(datetime.now().strftime(\"%Y-%m-%d %H:%M\"), message))\n", + " f.close()\n", + "\n", + "\n", + "# Borrower name split and page Anchors\n", + "def borrowerNameFix(jsonData):\n", + " \"\"\"\n", + " Function to fix the borrower name present in the document by fixing the suffix, prefix of the name and also by divding the full name into\n", + " smaller chunks of first name, middle name, last name\n", + "\n", + " Parameters\n", + " ----------\n", + " document :documentai.Document\n", + " The document proto having all the entities with the full name of borrower.\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto .\n", + " \"\"\"\n", + " global maxId\n", + " extraDict = documentai.Document()\n", + " for i in jsonData.entities:\n", + " if i.type == \"client_name\":\n", + " extraDict.entities.append(i)\n", + "\n", + " for i in jsonData.entities:\n", + " if i.type == \"client_name\":\n", + " jsonData.entities.remove(i)\n", + "\n", + " def ent_rename_borrower_name(document):\n", + " google_name = \"client_name\"\n", + " entity_values = []\n", + " entity_dict = {}\n", + " dict_ent1 = {}\n", + " for i in range(len(document.entities)):\n", + " if not hasattr(document.entities[i], \"properites\"):\n", + " if document.entities[i].type == google_name:\n", + " entity_dict[document.entities[i].mention_text] = []\n", + " for i in range(len(document.entities)):\n", + " if not hasattr(document.entities[i], \"properites\"):\n", + " if document.entities[i].type == google_name:\n", + " ent_val = entity_values.append(document.entities[i].mention_text)\n", + " if document.entities[i].mention_text in entity_dict.keys():\n", + " entity_dict[document.entities[i].mention_text].append(\n", + " document.entities[i].id\n", + " )\n", + " sorted_list = []\n", + " sorted_dict = {}\n", + " for i in entity_dict:\n", + " temp_list = []\n", + " for j in range(len(entity_dict[i])):\n", + " temp_list.append(int(entity_dict[i][j]))\n", + " sorted_list.append(min(temp_list))\n", + " sorted_list.sort()\n", + " for i in range(len(sorted_list)):\n", + " for j in entity_dict:\n", + " 
if str(sorted_list[i]) in entity_dict[j]:\n", + " if j not in sorted_dict:\n", + " sorted_dict[j] = i\n", + "\n", + " # return entity_dict,sorted(dict_ent1.items())\n", + " for i in range(len(document.entities)):\n", + " if not hasattr(document.entities[i], \"properites\"):\n", + " if document.entities[i].type == google_name:\n", + " for k in entity_dict[document.entities[i].mention_text]:\n", + " try:\n", + " if document.entities[i].id == k:\n", + " document.entities[i].type = (\n", + " \"Borrower_\"\n", + " + str(\n", + " sorted_dict[document.entities[i].mention_text]\n", + " + 1\n", + " )\n", + " + \"_Full_Name\"\n", + " )\n", + " except:\n", + " pass\n", + "\n", + " return document, entity_dict\n", + "\n", + " def suffix_checker(json_data):\n", + " possible_suffixes = [\"JR\", \"Jr\", \"III\", \"II\", \"MD\", \"PhD\", \"DVM\", \"DDS\"]\n", + " suffix_tracker = {}\n", + " for i in range(len(json_data.entities)):\n", + " if \"name\" in json_data.entities[i].type:\n", + " if json_data.entities[i].mention_text.split()[-1] in possible_suffixes:\n", + " suffix = json_data.entities[i].mention_text.split()[-1]\n", + " borrower_number = json_data.entities[i].type[:10]\n", + " suffix_tracker[borrower_number] = suffix\n", + " json_data.entities[i].mention_text = \" \".join(\n", + " map(str, i.mention_text.split()[:-1])\n", + " )\n", + " temp = copy.deepcopy(json_data.entities[i])\n", + " temp.type = borrower_number + \"_Suffix\"\n", + " temp.text_anchor.text_segments[0].start_index = str(\n", + " int(temp.text_anchor.text_segments[0].end_index - len(suffix))\n", + " )\n", + " temp.mention_text = suffix\n", + " temp.text_anchor[\"content\"] = suffix\n", + " json_data.entities.append(temp)\n", + " return json_data\n", + "\n", + " def split_rename(document, entity_dict):\n", + " import copy\n", + "\n", + " type_three_names = [\"first_name\", \"middle_name\", \"last_name\"]\n", + " type_three_names_with_comma = [\"last_name\", \"middle_name\", \"first_name\"]\n", + " type_two_names = [\"first_name\", \"last_name\"]\n", + " type_two_names_with_comma = [\"last_name\", \"first_name\"]\n", + " prefix = [\"mr\", \"mrs\", \"miss\", \"ms\", \"mx\", \"sir\", \"dr\"]\n", + " try:\n", + " for_entity_count = len(document.entities)\n", + " deleted_entites = []\n", + " for i in range(for_entity_count):\n", + " for j in entity_dict.keys():\n", + " for k in range(len(entity_dict[j])):\n", + " if not document.entities[i].properties:\n", + " if document.entities[i].id == entity_dict[j][k]:\n", + " name = document.entities[i].mention_text.split(\" \")\n", + "\n", + " try:\n", + " if name[0].lower() in prefix:\n", + " k = name[0] + \" \" + name[1]\n", + " name.pop(0)\n", + " name.pop(0)\n", + " name.insert(0, k)\n", + " except:\n", + " pass\n", + "\n", + " if len(name) == 2:\n", + " for m in range(len(name)):\n", + " temp = copy.deepcopy(document.entities[i])\n", + " # del temp.id\n", + " temp.mention_text = name[m]\n", + " index = temp.text_anchor.text_segments[0].copy()\n", + "\n", + " if m == 0:\n", + " temp.text_anchor.text_segments[\n", + " 0\n", + " ].end_index = str(\n", + " int(index.start_index) + len(name[0])\n", + " )\n", + " if name[0].endswith(\",\"):\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_two_names_with_comma[m]\n", + " )\n", + " else:\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_two_names[m]\n", + " )\n", + " 
else:\n", + " temp.text_anchor.text_segments[\n", + " 0\n", + " ].start_index = str(\n", + " int(index.end_index) - len(name[1])\n", + " )\n", + " if name[0].endswith(\",\"):\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_two_names_with_comma[m]\n", + " )\n", + " else:\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_two_names[m]\n", + " )\n", + " document.entities.append(temp)\n", + " elif len(name) == 3:\n", + " for m in range(len(name)):\n", + " temp = copy.deepcopy(document.entities[i])\n", + " temp.mention_text = name[m]\n", + " index = temp.text_anchor.text_segments[0].copy()\n", + " if m == 0:\n", + " temp.text_anchor.text_segments[\n", + " 0\n", + " ].end_index = str(\n", + " int(index.start_index) + len(name[0])\n", + " )\n", + " if name[0].endswith(\",\"):\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_three_names_with_comma[m]\n", + " )\n", + " else:\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_three_names[m]\n", + " )\n", + " elif k == 1:\n", + " temp.text_anchor.text_segments[\n", + " 0\n", + " ].start_index = str(\n", + " int(index.start_index)\n", + " + len(name[0])\n", + " + 1\n", + " )\n", + " temp.text_anchor.text_segments[\n", + " 0\n", + " ].end_index = str(\n", + " int(index.end_index) - len(name[2])\n", + " )\n", + " if name[0].endswith(\",\"):\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_three_names_with_comma[m]\n", + " )\n", + " else:\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_three_names[m]\n", + " )\n", + " else:\n", + " temp.text_anchor.text_segments[\n", + " 0\n", + " ].start_index = str(\n", + " int(index.end_index) - len(name[2]) + 1\n", + " )\n", + " if name[0].endswith(\",\"):\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_three_names_with_comma[m]\n", + " )\n", + " else:\n", + " temp.type = (\n", + " (\n", + " (\"_\").join(\n", + " temp.type.split(\"_\")[:2]\n", + " )\n", + " )\n", + " + \"_\"\n", + " + type_three_names[m]\n", + " )\n", + " document.entities.append(temp)\n", + "\n", + " except Exception as e:\n", + " print(e, \" :: \", i)\n", + " return document\n", + "\n", + " def text_anchorFix(jsonData, tempVar):\n", + " def text_anchorFixKid(jsonData, entityDict):\n", + " entityDict.text_anchor.content = entityDict.mention_text\n", + " if entityDict.type[-9:] != \"Full_Name\":\n", + " start = int(entityDict.text_anchor.text_segments[0].start_index)\n", + " end = int(entityDict.text_anchor.text_segments[0].end_index)\n", + "\n", + " while (\n", + " entityDict.mention_text != jsonData.text[start:end] and end > start\n", + " ):\n", + " end -= 1\n", + " entityDict.text_anchor.text_segments[0].start_index = str(start)\n", + " entityDict.text_anchor.text_segments[0].end_index = str(end)\n", + " return entityDict\n", + "\n", + " tempVarEntities = []\n", + " for i in tempVar.entities:\n", + " fixedDict = text_anchorFixKid(jsonData, i)\n", + " tempVarEntities.append(i)\n", + "\n", + " tempVar.entities = 
tempVarEntities\n", + " return tempVar\n", + "\n", + " def page_anchorFix(jsonData, tempVar):\n", + " tokenRange = {}\n", + " for i in range(0, len(jsonData.pages)):\n", + " for j in range(0, len(jsonData.pages[i].tokens)):\n", + " pageNumber = i\n", + " tokenNumber = j\n", + " try:\n", + " startIndex = int(\n", + " jsonData.pages[i]\n", + " .tokens[j]\n", + " .layout.text_anchor.text_segments[0]\n", + " .start_index\n", + " )\n", + " except:\n", + " startIndex = 0\n", + " endIndex = int(\n", + " jsonData.pages[i]\n", + " .tokens[j]\n", + " .layout.text_anchor.text_segments[0]\n", + " .end_index\n", + " )\n", + " tokenRange[range(startIndex, endIndex)] = {\n", + " \"page_number\": pageNumber,\n", + " \"token_number\": tokenNumber,\n", + " }\n", + "\n", + " for i in tempVar.entities:\n", + " if i.type is not \"Borrower_Full_Address\":\n", + " start = int(i.text_anchor.text_segments[0].start_index)\n", + " end = int(i.text_anchor.text_segments[0].end_index) - 1\n", + "\n", + " for j in tokenRange:\n", + " if start in j:\n", + " lowerToken = tokenRange[j]\n", + " for j in tokenRange:\n", + " if end in j:\n", + " upperToken = tokenRange[j]\n", + "\n", + " lowerTokenData = (\n", + " jsonData.pages[int(lowerToken[\"page_number\"])]\n", + " .tokens[int(lowerToken[\"token_number\"])]\n", + " .layout.bounding_poly.normalized_vertices\n", + " )\n", + " upperTokenData = (\n", + " jsonData.pages[int(upperToken[\"page_number\"])]\n", + " .tokens[int(upperToken[\"token_number\"])]\n", + " .layout.bounding_poly.normalized_vertices\n", + " )\n", + " # for A\n", + "\n", + " xA = float(lowerTokenData[0].x)\n", + " yA = float(lowerTokenData[0].y)\n", + " xA_ = float(upperTokenData[0].x)\n", + " yA_ = float(upperTokenData[0].y)\n", + " # for B\n", + " xB = float(lowerTokenData[1].x)\n", + " yB = float(lowerTokenData[1].y)\n", + " xB_ = float(upperTokenData[1].x)\n", + " yB_ = float(upperTokenData[1].y)\n", + " # for C\n", + " xC = float(lowerTokenData[2].x)\n", + " yC = float(lowerTokenData[2].y)\n", + " xC_ = float(upperTokenData[2].x)\n", + " yC_ = float(upperTokenData[2].y)\n", + " # for D\n", + " xD = float(lowerTokenData[3].x)\n", + " yD = float(lowerTokenData[3].y)\n", + " xD_ = float(upperTokenData[3].x)\n", + " yD_ = float(upperTokenData[3].y)\n", + "\n", + " A = {\"x\": min(xA, xA_), \"y\": min(yA, yA_)}\n", + " B = {\"x\": max(xB, xB_), \"y\": min(yB, yB_)}\n", + " C = {\"x\": max(xC, xC_), \"y\": max(yC, yC_)}\n", + " D = {\"x\": min(xD, xD_), \"y\": max(yD, yD_)}\n", + " i.page_anchor.page_refs[0].bounding_poly.normalized_vertices = [\n", + " A,\n", + " B,\n", + " C,\n", + " D,\n", + " ]\n", + " return tempVar\n", + "\n", + " x1, y1 = ent_rename_borrower_name(extraDict)\n", + " extraDict = suffix_checker(extraDict)\n", + " tempVar = split_rename(x1, y1)\n", + "\n", + " tempVar_2 = text_anchorFix(jsonData, tempVar)\n", + "\n", + " tempVar_3 = page_anchorFix(jsonData, tempVar_2)\n", + "\n", + " for i in tempVar_3.entities:\n", + " if i.type[-9:] != \"Full_Name\":\n", + " maxId += 1\n", + " i.id = str(maxId)\n", + " for i in tempVar_3.entities:\n", + " jsonData.entities.append(i)\n", + " return jsonData\n", + "\n", + "\n", + "# Entities statement_start_date,statement_end_date,starting_balance,ending_balance,bank_name\n", + "\n", + "\n", + "def ent_rename(\n", + " document: documentai.Document, google_name: str, specific_name: str\n", + ") -> documentai.Document:\n", + " \"\"\"\n", + " Function to rename the entities given by the user in variable dict_ent_rename.\n", + "\n", + " Parameters\n", + " 
----------\n", + " document :documentai.Document\n", + " The document proto having all the entities\n", + " google_name : str\n", + " The entity name present in dict_ent_rename variable as key which need to be replaced .\n", + " specific_name : str\n", + " The specific name which will replace the entity name and present in dict_ent_rename variable as value.\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto .\n", + " \"\"\"\n", + " for i in range(len(document.entities)):\n", + " if not hasattr(document.entities[i], \"properites\"):\n", + " if document.entities[i].type == google_name:\n", + " document.entities[i].type = specific_name\n", + " elif document.entities[i].properties:\n", + " for k in range(len(document.entities[i].properties)):\n", + " if document.entities[i].properties[k].type == google_name:\n", + " document.entities[i].properties[k].type = specific_name\n", + " return document\n", + "\n", + "\n", + "# adding pages entity\n", + "# total pages function\n", + "\n", + "\n", + "def add_total_pages(jsonData):\n", + " \"\"\"\n", + " Function to add the total page number and update the bounding poly.\n", + "\n", + " Parameters\n", + " ----------\n", + " document : :documentai.Document\n", + " The document proto having all the pages data.\n", + "\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto.\n", + " \"\"\"\n", + " pages_dict = documentai.Document.Entity()\n", + " pages_dict.type = \"Total_Pages\"\n", + " total = str(len(jsonData.pages))\n", + " pages_dict.mention_text = total\n", + " # jsonData.entities.append(pages_dict)\n", + " pages_dict.confidence = 1\n", + " tokenRange = {}\n", + " start = jsonData.text.rfind(\"Page\" + \" \" + str(len(jsonData.pages)))\n", + " end = int(start) + 11\n", + "\n", + " for j in range(0, len(jsonData.pages[(len(jsonData.pages)) - 1].tokens)):\n", + " pageNumber = (len(jsonData.pages)) - 1\n", + " tokenNumber = j\n", + " try:\n", + " startIndex = int(\n", + " jsonData.pages[(len(jsonData.pages)) - 1]\n", + " .tokens[j]\n", + " .layout.text_anchor.text_segments[0]\n", + " .start_index\n", + " )\n", + " except:\n", + " startIndex = 0\n", + " endIndex = int(\n", + " jsonData.pages[(len(jsonData.pages)) - 1]\n", + " .tokens[j]\n", + " .layout.text_anchor.text_segments[0]\n", + " .end_index\n", + " )\n", + " tokenRange[range(startIndex, endIndex)] = {\n", + " \"pageNumber\": pageNumber,\n", + " \"tokenNumber\": tokenNumber,\n", + " }\n", + "\n", + " for j in tokenRange:\n", + " if start in j:\n", + " lowerToken = tokenRange[j]\n", + " for j in tokenRange:\n", + " if end in j:\n", + " upperToken = tokenRange[j]\n", + "\n", + " lowerTokenData = (\n", + " jsonData.pages[int(lowerToken[\"pageNumber\"])]\n", + " .tokens[int(lowerToken[\"tokenNumber\"])]\n", + " .layout.bounding_poly.normalized_vertices\n", + " )\n", + " upperTokenData = (\n", + " jsonData.pages[int(upperToken[\"pageNumber\"])]\n", + " .tokens[int(upperToken[\"tokenNumber\"])]\n", + " .layout.bounding_poly.normalized_vertices\n", + " )\n", + "\n", + " # for A\n", + " xA = float(lowerTokenData[0].x)\n", + " yA = float(lowerTokenData[0].y)\n", + " xA_ = float(upperTokenData[0].x)\n", + " yA_ = float(upperTokenData[0].y)\n", + " # for B\n", + " xB = float(lowerTokenData[1].x)\n", + " yB = float(lowerTokenData[1].y)\n", + " xB_ = float(upperTokenData[1].x)\n", + " yB_ = float(upperTokenData[1].y)\n", + " # for C\n", + " xC = float(lowerTokenData[2].x)\n", + " yC = 
float(lowerTokenData[2].y)\n", + " xC_ = float(upperTokenData[2].x)\n", + " yC_ = float(upperTokenData[2].y)\n", + " # for D\n", + " xD = float(lowerTokenData[3].x)\n", + " yD = float(lowerTokenData[3].y)\n", + " xD_ = float(upperTokenData[3].x)\n", + " yD_ = float(upperTokenData[3].y)\n", + "\n", + " A = {\"x\": min(xA, xA_), \"y\": min(yA, yA_)}\n", + " B = {\"x\": max(xB, xB_), \"y\": min(yB, yB_)}\n", + " C = {\"x\": max(xC, xC_), \"y\": max(yC, yC_)}\n", + " D = {\"x\": min(xD, xD_), \"y\": max(yD, yD_)}\n", + " boundpoly = {}\n", + " boundpoly[\"normalized_vertices\"] = [A, B, C, D]\n", + " # pages_dict.page_anchor.page_refs[0].bounding_poly.normalized_vertices = [A, B, C, D]\n", + " pages_dict.page_anchor = {\n", + " \"page_refs\": [\n", + " {\n", + " \"bounding_poly\": {\"normalized_vertices\": [A, B, C, D]},\n", + " \"page\": str(int(total) - 1),\n", + " }\n", + " ]\n", + " }\n", + " pages_dict.text_anchor = {\n", + " \"content\": pages_dict.mention_text,\n", + " \"text_segments\": [{\"end_index\": str(end), \"start_index\": str(start)}],\n", + " }\n", + " jsonData.entities.append(pages_dict)\n", + "\n", + " return jsonData\n", + "\n", + "\n", + "def delete_empty(document: documentai.Document) -> documentai.Document:\n", + " \"\"\"\n", + " Function remove the enitity from the entities list if the entity is empty\n", + "\n", + " Parameters\n", + " ----------\n", + " document : :documentai.Document\n", + " The document proto having all the entities\n", + "\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto after removal of empty entities\n", + " \"\"\"\n", + " for i in range(len(document.entities)):\n", + " try:\n", + " if document.entities[i] == \"\":\n", + " del document.entities[i]\n", + " except:\n", + " pass\n", + " return document\n", + "\n", + "\n", + "# splitting client_address and creating new entities\n", + "\n", + "\n", + "def has_digit(s: str) -> str:\n", + " \"\"\"\n", + " Function to check if string have digit.\n", + "\n", + " Parameters\n", + " ----------\n", + " s :str\n", + " The string which can have digit.\n", + "\n", + " Returns\n", + " -------\n", + " str\n", + " Returns true or false depends on if digit is present or not.\n", + " \"\"\"\n", + " return any(char.isdigit() for char in s)\n", + "\n", + "\n", + "def parse_last_line(last_line: str) -> Tuple:\n", + " \"\"\"\n", + " Function to parse the address .\n", + "\n", + " Parameters\n", + " ----------\n", + " last_line :str\n", + " The string in address format\n", + "\n", + " Returns\n", + " -------\n", + " Tuple\n", + " Returns the tuple with city,state, zip .\n", + " \"\"\"\n", + " match = re.search(r\"([A-Z]{2})((\\)|\\s|,|\\.)*)(\\d{5})\", last_line)\n", + " # match = re.search(r'([A-Za-z]*)|\\(|([A-Z]{2})\\)|((\\s|,|\\.)*)(\\d{5})', last_line)\n", + "\n", + " if not match:\n", + " return None\n", + " elif match.start() > 0 and last_line[match.start() - 1].isalnum():\n", + " return None\n", + " matched_state_zip = last_line[match.start() : match.end()]\n", + " zip_start = re.search(r\"\\d{5}\", matched_state_zip).start()\n", + "\n", + " state, zip = (\n", + " re.sub(r\"[^\\w\\s]\", \"\", matched_state_zip[0:zip_start].strip()),\n", + " matched_state_zip[zip_start:],\n", + " )\n", + " zip_to_end = last_line[match.start() + zip_start :]\n", + "\n", + " unmatched_tokens = [\n", + " t for t in last_line[0 : match.start()].split() if t and has_digit(t)\n", + " ]\n", + " city_candiates = [\n", + " t for t in last_line[0 : match.start()].split() if t and not 
has_digit(t)\n", + " ]\n", + "\n", + " city = None\n", + " if city_candiates:\n", + " if len(city_candiates) > 2:\n", + " unmatched_tokens.extend(city_candiates[0:-2])\n", + " city = \" \".join(city_candiates[-2:0])\n", + " else:\n", + " city = \" \".join(city_candiates)\n", + " unmatched = \" \".join(unmatched_tokens)\n", + "\n", + " return (city, state, zip, unmatched, zip_to_end)\n", + "\n", + "\n", + "def split_address_entities(entity_type: str, mention_text: str) -> Dict:\n", + " \"\"\"\n", + " Function to split the address into multiple address line like zip,stae,city,street address.\n", + "\n", + " Parameters\n", + " ----------\n", + " entity_type : str\n", + " The entity name from the document proto object\n", + " mention_text : str\n", + " The OCR text of the entity having the actual data of address\n", + " Returns\n", + " -------\n", + " Dict\n", + " Returns the dictonary object with the splitted address inthe form of entity as key and value as text.\n", + " \"\"\"\n", + " text_lines = [line.strip() for line in mention_text.split(\"\\n\") if line.strip()]\n", + " if len(text_lines) == 2 or len(text_lines) == 3:\n", + " parsing = parse_last_line(text_lines[-1])\n", + " if parsing is not None and parsing[0] is not None:\n", + " if (\n", + " len(text_lines) == 3\n", + " and not has_digit(text_lines[0])\n", + " and \"box\" not in text_lines[0].casefold()\n", + " and not text_lines[0].startswith(\"o \")\n", + " ):\n", + " del text_lines[0]\n", + " line2 = text_lines[1] if len(text_lines) == 3 else \"\"\n", + " return {\n", + " f\"{entity_type}_StreetAddressOrPostalBox\": text_lines[0],\n", + " f\"{entity_type}_AdditionalStreetAddressOrPostalBox\": line2,\n", + " f\"{entity_type}_City\": parsing[0],\n", + " f\"{entity_type}_State\": parsing[1],\n", + " f\"{entity_type}_Zip\": parsing[4],\n", + " }\n", + "\n", + " if len(text_lines) == 1:\n", + " parsing = parse_last_line(text_lines[0])\n", + " if parsing is None:\n", + " raise ValueError(\"Likely invalid redaction.\")\n", + " else:\n", + " return {\n", + " f\"{entity_type}_StreetAddressOrPostalBox\": parsing[3],\n", + " f\"{entity_type}_City\": parsing[0] if parsing[0] else \"\",\n", + " f\"{entity_type}_State\": parsing[1],\n", + " f\"{entity_type}_Zip\": parsing[2],\n", + " }\n", + " else:\n", + " last_line_candidates = [\n", + " i for i in range(len(text_lines)) if parse_last_line(text_lines[i])\n", + " ]\n", + " if not last_line_candidates:\n", + " all_tokens = \" \".join(text_lines).split()\n", + " state_token_id, zip_token_id = None, None\n", + " for i, token in enumerate(all_tokens):\n", + " if re.fullmatch(r\"([A-Z]{2})((\\s|,|\\.)*)\", token):\n", + " state_token_id = i\n", + " if re.fullmatch(r\"\\d{5}\", token):\n", + " zip_token_id = i\n", + "\n", + " if state_token_id is None or zip_token_id is None:\n", + " raise ValueError(\"Likely invalid redaction, no zip or state.\")\n", + " else:\n", + " search_start = max(min(state_token_id, zip_token_id) - 1, 0)\n", + " search_end = min(max(state_token_id, zip_token_id) + 2, len(all_tokens))\n", + " city_candidates = [\n", + " i\n", + " for i in range(search_start, search_end)\n", + " if i not in [state_token_id, zip_token_id]\n", + " and not has_digit(all_tokens[i])\n", + " ]\n", + "\n", + " ids_to_remove = [state_token_id, zip_token_id]\n", + " city = None\n", + " if city_candidates:\n", + " city = all_tokens[city_candidates[-1]]\n", + " ids_to_remove.append(city_candidates[-1])\n", + " line_1 = \" \".join(\n", + " [\n", + " all_tokens[i]\n", + " for i in 
range(len(all_tokens))\n", + " if i not in ids_to_remove\n", + " ]\n", + " )\n", + " return {\n", + " f\"{entity_type}_StreetAddressOrPostalBox\": line_1,\n", + " f\"{entity_type}_City\": city if city else \"\",\n", + " f\"{entity_type}_State\": all_tokens[state_token_id],\n", + " f\"{entity_type}_Zip\": all_tokens[zip_token_id],\n", + " }\n", + " else:\n", + " last_line_id = max(last_line_candidates)\n", + " parsing = parse_last_line(text_lines[last_line_id])\n", + " remaining_lines = text_lines[0:last_line_id]\n", + " if not remaining_lines:\n", + " return {\n", + " f\"{entity_type}_StreetAddressOrPostalBox\": parsing[3],\n", + " f\"{entity_type}_City\": parsing[0] if parsing[0] else \"\",\n", + " f\"{entity_type}_State\": parsing[1],\n", + " f\"{entity_type}_Zip\": parsing[2],\n", + " }\n", + " else:\n", + " city = parsing[0]\n", + " if city is None and not has_digit(remaining_lines[-1]):\n", + " city = remaining_lines[-1]\n", + " remaining_lines = remaining_lines[0:-1]\n", + " if parsing[3]:\n", + " remaining_lines.append(parsing[3])\n", + " line_1, line_2 = None, None\n", + " if remaining_lines:\n", + " line_1 = remaining_lines[0]\n", + " if len(remaining_lines) > 1:\n", + " line_2 = \" \".join(remaining_lines[1:])\n", + " return {\n", + " f\"{entity_type}_StreetAddressOrPostalBox\": line_1 if line_1 else \"\",\n", + " f\"{entity_type}_AdditionalStreetAddressOrPostalBox\": line_2\n", + " if line_2\n", + " else \"\",\n", + " f\"{entity_type}_City\": city if city else \"\",\n", + " f\"{entity_type}_State\": parsing[1],\n", + " f\"{entity_type}_Zip\": parsing[2],\n", + " }\n", + "\n", + "\n", + "# Replacing the Address , splitting and page anchors\n", + "\n", + "\n", + "def address_function(data: documentai.Document) -> documentai.Document:\n", + " \"\"\"\n", + " Function to fix the address by the entity name (ex : Borrower_Street_Address,Borrower_City,Borrower_State,Borrower_Zip)\n", + " by fixing the text anchor, page anchor.\n", + "\n", + " Parameters\n", + " ----------\n", + " data : documentai.Document\n", + " The document proto data having the entities which needs to be change.\n", + "\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto object.\n", + " \"\"\"\n", + " global maxId\n", + " newData = documentai.Document()\n", + " for i in data.entities:\n", + " if i.type == \"client_address\":\n", + " newData.entities.append(i)\n", + " data.entities.remove(i)\n", + "\n", + " def address_text_anchor(jsonData, tempVar):\n", + " # text_anchor fix\n", + " for i in tempVar.entities:\n", + " if i.type == \"Borrower_Full_Address\":\n", + " start = int(i.text_anchor.text_segments[0].start_index)\n", + " end = int(i.text_anchor.text_segments[0].end_index)\n", + " else:\n", + " i.mention_text = i.mention_text.replace(\"\\\\n\", \" \")\n", + " end = start + len(i.mention_text)\n", + " while i.mention_text.split() != jsonData.text[start:end].split() and (\n", + " end < len(jsonData.text) - 1\n", + " ):\n", + " start += 1\n", + " end += 1\n", + " i.text_anchor.text_segments[0].start_index = str(start)\n", + " i.text_anchor.text_segments[0].end_index = str(end)\n", + " i.text_anchor.content = i.mention_text\n", + " start = end\n", + " return tempVar\n", + "\n", + " def address_page_anchor(jsonData, tempVar):\n", + " tokenRange = {}\n", + " for i in range(0, len(jsonData.pages)):\n", + " for j in range(0, len(jsonData.pages[i].tokens)):\n", + " pageNumber = i\n", + " tokenNumber = j\n", + " try:\n", + " startIndex = int(\n", + " 
jsonData.pages[i]\n", + " .tokens[j]\n", + " .layout.text_anchor.text_segments[0]\n", + " .start_index\n", + " )\n", + " except:\n", + " startIndex = 0\n", + " endIndex = int(\n", + " jsonData.pages[i]\n", + " .tokens[j]\n", + " .layout.text_anchor.text_segments[0]\n", + " .end_index\n", + " )\n", + " tokenRange[range(startIndex, endIndex)] = {\n", + " \"pageNumber\": pageNumber,\n", + " \"tokenNumber\": tokenNumber,\n", + " }\n", + "\n", + " for i in tempVar.entities:\n", + " if i.type is not \"Borrower_Full_Address\":\n", + " start = int(i.text_anchor.text_segments[0].start_index)\n", + " end = int(i.text_anchor.text_segments[0].end_index) - 1\n", + "\n", + " for j in tokenRange:\n", + " if start in j:\n", + " lowerToken = tokenRange[j]\n", + " for j in tokenRange:\n", + " if end in j:\n", + " upperToken = tokenRange[j]\n", + "\n", + " lowerTokenData = (\n", + " jsonData.pages[int(lowerToken[\"pageNumber\"])]\n", + " .tokens[int(lowerToken[\"tokenNumber\"])]\n", + " .layout.bounding_poly.normalized_vertices\n", + " )\n", + " upperTokenData = (\n", + " jsonData.pages[int(upperToken[\"pageNumber\"])]\n", + " .tokens[int(upperToken[\"tokenNumber\"])]\n", + " .layout.bounding_poly.normalized_vertices\n", + " )\n", + " # for A\n", + " # for A\n", + " xA = float(lowerTokenData[0].x)\n", + " yA = float(lowerTokenData[0].y)\n", + " xA_ = float(upperTokenData[0].x)\n", + " yA_ = float(upperTokenData[0].y)\n", + " # for B\n", + " xB = float(lowerTokenData[1].x)\n", + " yB = float(lowerTokenData[1].y)\n", + " xB_ = float(upperTokenData[1].x)\n", + " yB_ = float(upperTokenData[1].y)\n", + " # for C\n", + " xC = float(lowerTokenData[2].x)\n", + " yC = float(lowerTokenData[2].y)\n", + " xC_ = float(upperTokenData[2].x)\n", + " yC_ = float(upperTokenData[2].y)\n", + " # for D\n", + " xD = float(lowerTokenData[3].x)\n", + " yD = float(lowerTokenData[3].y)\n", + " xD_ = float(upperTokenData[3].x)\n", + " yD_ = float(upperTokenData[3].y)\n", + "\n", + " A = {\"x\": min(xA, xA_), \"y\": min(yA, yA_)}\n", + " B = {\"x\": max(xB, xB_), \"y\": min(yB, yB_)}\n", + " C = {\"x\": max(xC, xC_), \"y\": max(yC, yC_)}\n", + " D = {\"x\": min(xD, xD_), \"y\": max(yD, yD_)}\n", + " i.page_anchor.page_refs[0].bounding_poly.normalized_vertices = [\n", + " A,\n", + " B,\n", + " C,\n", + " D,\n", + " ]\n", + " return tempVar\n", + "\n", + " def address_function_new(data):\n", + " deleted_entities = []\n", + " address_entity_names = [\n", + " \"Borrower_Street_Address\",\n", + " \"Borrower_City\",\n", + " \"Borrower_State\",\n", + " \"Borrower_Zip\",\n", + " ]\n", + " address_entity_name_and_value = {}\n", + " address_parser = AddressParser(device=0) # On GPU device 0\n", + " for i in range(len(data.entities)):\n", + " try:\n", + " if data.entities[i].type == \"client_address\":\n", + " deleted_entities.append(i)\n", + " full_address = \" \".join(data.entities[i].mention_text.split())\n", + " parse_address = address_parser(full_address)\n", + "\n", + " StreetNameMatch = re.search(\n", + " (\n", + " parse_address.StreetNumber\n", + " + \" \"\n", + " + parse_address.StreetName\n", + " + \" \"\n", + " ),\n", + " full_address,\n", + " flags=re.IGNORECASE,\n", + " )\n", + " address_entity_name_and_value[\n", + " \"Borrower_Street_Address\"\n", + " ] = full_address[StreetNameMatch.start() : StreetNameMatch.end()]\n", + "\n", + " CityNameMatch = re.search(\n", + " parse_address.Municipality, full_address, flags=re.IGNORECASE\n", + " )\n", + " address_entity_name_and_value[\"Borrower_City\"] = full_address[\n", + " 
CityNameMatch.start() : CityNameMatch.end() + 1\n", + " ]\n", + "\n", + " StateNameMatch = re.search(\n", + " parse_address.Province.upper(), full_address\n", + " )\n", + " address_entity_name_and_value[\"Borrower_State\"] = full_address[\n", + " StateNameMatch.start() : StateNameMatch.end()\n", + " ]\n", + "\n", + " PostalCodeMatch = re.search(parse_address.PostalCode, full_address)\n", + " address_entity_name_and_value[\"Borrower_Zip\"] = full_address[\n", + " PostalCodeMatch.start() : PostalCodeMatch.end() + 1\n", + " ]\n", + "\n", + " for j in range(4):\n", + " temp = copy.deepcopy(data.entities[i])\n", + " temp.type = address_entity_names[j]\n", + " temp.mention_text = address_entity_name_and_value[\n", + " address_entity_names[j]\n", + " ]\n", + " data.entities.append(temp)\n", + "\n", + " except:\n", + " print(\"Can't split full_address in sub-parts using deepParse\")\n", + " print(data.entities[i])\n", + " if data.entities[i].type == \"client_address\":\n", + " deleted_entities.append(i)\n", + " s = data.entities[i].mention_text\n", + " split_result = split_address_entities(\"client_address\", s)\n", + " for j in range(4):\n", + " temp = copy.deepcopy(data.entities[i])\n", + " temp.type = address_entity_names[j]\n", + " if address_entity_names[j] == \"Borrower_Street_Address\":\n", + " if (\n", + " \"client_address_AdditionalStreetAddressOrPostalBox\"\n", + " in split_result.keys()\n", + " ):\n", + " temp.mention_text = (\n", + " split_result[\n", + " \"client_address_StreetAddressOrPostalBox\"\n", + " ]\n", + " + split_result[\n", + " \"client_address_AdditionalStreetAddressOrPostalBox\"\n", + " ]\n", + " )\n", + " else:\n", + " temp.mention_text = split_result[\n", + " \"client_address_StreetAddressOrPostalBox\"\n", + " ]\n", + " elif address_entity_names[j] == \"Borrower_City\":\n", + " temp.mention_text = split_result[\"client_address_City\"]\n", + " elif address_entity_names[j] == \"Borrower_State\":\n", + " temp.mention_text = split_result[\"client_address_State\"]\n", + " else:\n", + " temp.mention_text = split_result[\"client_address_Zip\"]\n", + " data.entities.append(temp)\n", + " for i in deleted_entities[::-1]:\n", + " data.entities[i].type = \"Borrower_Full_Address\"\n", + " return data\n", + "\n", + " tempVar = address_function_new(newData)\n", + " tempVar_2 = address_text_anchor(data, tempVar)\n", + " tempVar_3 = address_page_anchor(data, tempVar)\n", + " for i in tempVar_3.entities:\n", + " if i.type != \"Borrower_Full_Address\":\n", + " maxId += 1\n", + " i.id = str(maxId)\n", + " for i in tempVar_3.entities:\n", + " data.entities.append(i)\n", + " return data\n", + "\n", + "\n", + "def fixAccountBalance(document: documentai.Document) -> documentai.Document:\n", + " \"\"\"\n", + " Function to fix the account balance by updatding the ,ention text of the entities.\n", + "\n", + " Parameters\n", + " ----------\n", + " jsonData : documentai.Document\n", + " The document proto data having the entities which needs to be change.\n", + "\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto object.\n", + " \"\"\"\n", + " from collections import Counter\n", + "\n", + " def most_frequent(List):\n", + " occurence_count = Counter(List)\n", + " return occurence_count.most_common(1)[0][0]\n", + "\n", + " tempDict = {}\n", + " beginning_balance_unique = []\n", + " ending_balance_unique = []\n", + " for i in document.entities:\n", + " if \"beginning_balance\" in i.type:\n", + " if i.type not in beginning_balance_unique:\n", + " 
beginning_balance_unique.append(i.type)\n", + " if \"ending_balance\" in i.type:\n", + " if i.type not in ending_balance_unique:\n", + " ending_balance_unique.append(i.type)\n", + " beg_end_dict = {}\n", + " for i in beginning_balance_unique:\n", + " temp = []\n", + "\n", + " for j in range(0, len(document.entities)):\n", + " if i == document.entities[j].type:\n", + " temp.append(document.entities[j].mention_text.strip(\"$#\"))\n", + " beg_end_dict[i] = most_frequent(temp)\n", + " for i in ending_balance_unique:\n", + " temp = []\n", + " for j in range(0, len(document.entities)):\n", + " if i == document.entities[j].type:\n", + " temp.append(document.entities[j].mention_text.strip(\"$#\"))\n", + " beg_end_dict[i] = most_frequent(temp)\n", + " for i in document.entities:\n", + " if i.type in beg_end_dict.keys():\n", + " if i.mention_text.strip(\"$#\") != beg_end_dict[\n", + " i.type\n", + " ] and i.mention_text.strip(\"$#\") in list(beg_end_dict.values()):\n", + " i.type = list(beg_end_dict.keys())[\n", + " list(beg_end_dict.values()).index(i.mention_text.strip(\"$#\"))\n", + " ]\n", + " elif i.mention_text.strip(\"$#\") != beg_end_dict[i.type]:\n", + " document.entities.remove(i)\n", + " return document\n", + "\n", + "\n", + "def Boundary_markers(\n", + " jsonData: documentai.Document,\n", + ") -> documentai.Document: # TODO : check jsonDict for dict or entity obj\n", + " \"\"\"\n", + " Function to mark the boundary bonding boxes for the required entities.\n", + "\n", + " Parameters\n", + " ----------\n", + " jsonData : documentai.Document\n", + " The document proto data having the entities which needs to be change.\n", + "\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto object.\n", + " \"\"\"\n", + " allEntities = jsonData.entities\n", + " noOfEntitiesInJsonFile = len(allEntities)\n", + "\n", + " # Find entityIdSchema of Json\n", + " entityIdSchema = {}\n", + " for i in range(0, noOfEntitiesInJsonFile):\n", + " try:\n", + " if allEntities[i].id:\n", + " entityIdSchema[i] = [int(allEntities[i].id)]\n", + " except:\n", + " temp_arr = []\n", + " for j in allEntities[i].properties:\n", + " temp_arr.append(int(j.id))\n", + " entityIdSchema[i] = temp_arr\n", + "\n", + " # Single Level Entities file : jsonDict\n", + " jsonDict = {\n", + " \"confidence\": [],\n", + " \"id\": [],\n", + " \"mention_text\": [],\n", + " \"normalized_value\": [],\n", + " \"page_anchor\": [],\n", + " \"text_anchor\": [],\n", + " \"type\": [],\n", + " }\n", + " entitiesArray = []\n", + "\n", + " for i in range(0, noOfEntitiesInJsonFile):\n", + " try:\n", + " if allEntities[i].id:\n", + " entitiesArray.append(allEntities[i])\n", + " except:\n", + " for j in allEntities[i].properties:\n", + " entitiesArray.append(j)\n", + " entitiesArray = sorted(entitiesArray, key=lambda x: x.id)\n", + " for i in range(0, len(entitiesArray)):\n", + " try:\n", + " jsonDict[\"confidence\"].append(entitiesArray[i].confidence)\n", + " except:\n", + " jsonDict[\"confidence\"].append(\"\")\n", + " try:\n", + " jsonDict[\"id\"].append(entitiesArray[i].id)\n", + " except:\n", + " jsonDict[\"id\"].append(\"\")\n", + " try:\n", + " jsonDict[\"mention_text\"].append(entitiesArray[i].mention_text)\n", + " except:\n", + " jsonDict[\"mention_text\"].append(\"\")\n", + " try:\n", + " jsonDict[\"normalized_value\"].append(entitiesArray[i].normalized_value)\n", + " except:\n", + " jsonDict[\"normalized_value\"].append(\"\")\n", + " try:\n", + " 
jsonDict[\"page_anchor\"].append(entitiesArray[i].page_anchor)\n", + " except:\n", + " jsonDict[\"page_anchor\"].append(\"\")\n", + " try:\n", + " jsonDict[\"text_anchor\"].append(entitiesArray[i].text_anchor)\n", + " except:\n", + " jsonDict[\"text_anchor\"].append(\"\")\n", + " try:\n", + " jsonDict[\"type\"].append(entitiesArray[i].type)\n", + " except:\n", + " jsonDict[\"type\"].append(\"\")\n", + "\n", + " # No startIndex handeling\n", + " for i in range(0, len(jsonDict[\"type\"])):\n", + " try:\n", + " if jsonDict[\"text_anchor\"][i][\"text_segments\"][0][\"start_index\"]:\n", + " pass\n", + " except:\n", + " try:\n", + " jsonDict[\"text_anchor\"][i][\"text_segments\"][0][\"start_index\"] = \"0\"\n", + " except:\n", + " pass\n", + " accountNumbers = dict()\n", + " for i in range(0, len(jsonDict[\"id\"])):\n", + " if jsonDict[\"type\"][i] == \"account_number\":\n", + " if (\n", + " re.sub(\"\\D\", \"\", jsonDict[\"mention_text\"][i].strip(\".#:' \"))\n", + " not in accountNumbers\n", + " and len(re.sub(\"\\D\", \"\", jsonDict[\"mention_text\"][i].strip(\".#:' \")))\n", + " > 5\n", + " ):\n", + " accountNumbers[\n", + " re.sub(\"\\D\", \"\", jsonDict[\"mention_text\"][i].strip(\".#:' \"))\n", + " ] = (\"account_\" + str(len(accountNumbers)) + \"_number\")\n", + " account_number_dict = {}\n", + " import sys\n", + "\n", + " accountNumberDict = {}\n", + " accountNumberPageDict = {}\n", + " for i in accountNumbers.keys():\n", + " temp_list = []\n", + " temp_page_list = set()\n", + " for j in range(len(jsonDict[\"mention_text\"])):\n", + " if re.sub(\"\\D\", \"\", jsonDict[\"mention_text\"][j].strip(\".#:' \")) == i:\n", + " page = 0\n", + " if jsonDict[\"page_anchor\"][j][\"page_refs\"][0][\"page\"]:\n", + " page = int(jsonDict[\"page_anchor\"][j][\"page_refs\"][0][\"page\"])\n", + " temp_list.append(\n", + " (\n", + " int(\n", + " jsonDict[\"text_anchor\"][j][\"text_segments\"][0][\n", + " \"start_index\"\n", + " ]\n", + " ),\n", + " int(\n", + " jsonDict[\"text_anchor\"][j][\"text_segments\"][0][\"end_index\"]\n", + " ),\n", + " page,\n", + " )\n", + " )\n", + " temp_page_list.add(page)\n", + " accountNumberPageDict[accountNumbers[i]] = temp_page_list\n", + " accountNumberDict[accountNumbers[i]] = temp_list\n", + " n = set(range(len(jsonData.pages)))\n", + " for i in accountNumberPageDict:\n", + " n = n & accountNumberPageDict[i]\n", + "\n", + " n = list(n)\n", + " for i in accountNumberDict:\n", + " accountNumberDict[i].sort(key=lambda x: x[2])\n", + " accountNumbersToDelete = []\n", + " for i in accountNumberDict:\n", + " if i != \"account_0_number\":\n", + " minStartIndex = sys.maxsize\n", + " minEndIndex = sys.maxsize\n", + " minPage = sys.maxsize\n", + " tuppleToRemove = []\n", + " if len(accountNumberDict[i]) > 1:\n", + " for j in accountNumberDict[i]:\n", + " if (\n", + " j[2] in n\n", + " and j[2] < 3\n", + " and len(tuppleToRemove) < len(accountNumberDict[i]) - 1\n", + " ):\n", + " tuppleToRemove.append(j)\n", + " else:\n", + " minStartIndex = min(minStartIndex, j[0])\n", + " minEndIndex = min(minEndIndex, j[1])\n", + " minPage = min(minPage, j[2])\n", + " tuppleToRemove.append(j)\n", + " # accountNumberDict[i]=[(minStartIndex,minEndIndex,minPage)]\n", + " for k in tuppleToRemove:\n", + " accountNumberDict[i].remove(k)\n", + " accountNumberDict[i] = [(minStartIndex, minEndIndex, minPage)]\n", + " else:\n", + " accountNumbersToDelete.append(i)\n", + " else:\n", + " minStartIndex = 0\n", + " minEndIndex = 0\n", + " minPage = sys.maxsize\n", + " tuppleToRemove = []\n", + " for j in 
accountNumberDict[i]:\n", + " if j[2] == n[0]:\n", + " minStartIndex = max(minStartIndex, j[0])\n", + " minEndIndex = max(minEndIndex, j[1])\n", + " minPage = min(minPage, j[2])\n", + " tuppleToRemove.append(j)\n", + " for k in tuppleToRemove:\n", + " accountNumberDict[i].remove(k)\n", + " accountNumberDict[\"account_0_number\"] = [\n", + " (minStartIndex, minEndIndex, minPage)\n", + " ]\n", + " for i in accountNumbersToDelete:\n", + " del accountNumberDict[i]\n", + "\n", + " if len(accountNumbers) > 1:\n", + " borderIndex = []\n", + " for i in accountNumberDict:\n", + " borderIndex.append((accountNumberDict[i][0][0], accountNumberDict[i][0][1]))\n", + "\n", + " regionSplitter = []\n", + " for i in range(0, len(borderIndex)):\n", + " regionSplitter.append(borderIndex[i][0])\n", + "\n", + " # regionSplitterDict = {0 : 'account_summary'}\n", + " regionSplitterDict = {}\n", + " for i in range(0, len(regionSplitter)):\n", + " regionSplitterDict[int(regionSplitter[i])] = \"account_\" + str(i)\n", + " regionSplitterDict[len(jsonData[\"text\"])] = \"last_index\"\n", + "\n", + " else:\n", + " tempVar = len(jsonData.text)\n", + " regionSplitterDict = {tempVar: \"account_0\"}\n", + " regionSplitterDict[len(jsonData.text) + 1] = \"last_index\"\n", + "\n", + " for i in range(0, len(jsonDict[\"id\"])):\n", + " if (\n", + " jsonDict[\"type\"][i] == \"account_number\"\n", + " and len(re.sub(\"\\D\", \"\", jsonDict[\"mention_text\"][i].strip(\".#:' \"))) > 5\n", + " ):\n", + " jsonDict[\"type\"][i] = accountNumbers[\n", + " re.sub(\"\\D\", \"\", jsonDict[\"mention_text\"][i].strip(\".#:' \"))\n", + " ]\n", + "\n", + " for i in range(0, len(jsonDict[\"id\"])):\n", + " try:\n", + " si = jsonDict[\"text_anchor\"][i][\"text_segments\"][0][\"start_index\"]\n", + " except:\n", + " continue\n", + "\n", + " if jsonDict[\"type\"][i] == \"starting_balance\":\n", + " for j in range(1, len(regionSplitterDict)):\n", + " if int(si) < list(regionSplitterDict.keys())[j]:\n", + " jsonDict[\"type\"][i] = (\n", + " regionSplitterDict[list(regionSplitterDict.keys())[j - 1]]\n", + " + \"_beginning_balance\"\n", + " )\n", + " break\n", + " if jsonDict[\"type\"][i] == \"ending_balance\":\n", + " for j in range(1, len(regionSplitterDict)):\n", + " if int(si) < list(regionSplitterDict.keys())[j]:\n", + " jsonDict[\"type\"][i] = (\n", + " regionSplitterDict[list(regionSplitterDict.keys())[j - 1]]\n", + " + \"_ending_balance\"\n", + " )\n", + " break\n", + "\n", + " if jsonDict[\"type\"][i] == \"table_item/transaction_deposit_date\":\n", + " for j in range(1, len(regionSplitterDict)):\n", + " if int(si) < list(regionSplitterDict.keys())[j]:\n", + " jsonDict[\"type\"][i] = (\n", + " regionSplitterDict[list(regionSplitterDict.keys())[j - 1]]\n", + " + \"_transaction\"\n", + " + \"/\"\n", + " + \"deposit_date\"\n", + " )\n", + " break\n", + "\n", + " if jsonDict[\"type\"][i] == \"table_item/transaction_deposit_description\":\n", + " for j in range(1, len(regionSplitterDict)):\n", + " if int(si) < list(regionSplitterDict.keys())[j]:\n", + " jsonDict[\"type\"][i] = (\n", + " regionSplitterDict[list(regionSplitterDict.keys())[j - 1]]\n", + " + \"_transaction\"\n", + " + \"/\"\n", + " + \"deposit_desc\"\n", + " )\n", + " break\n", + "\n", + " if jsonDict[\"type\"][i] == \"table_item/transaction_deposit\":\n", + " for j in range(1, len(regionSplitterDict)):\n", + " if int(si) < list(regionSplitterDict.keys())[j]:\n", + " jsonDict[\"type\"][i] = (\n", + " regionSplitterDict[list(regionSplitterDict.keys())[j - 1]]\n", + " + 
\"_transaction\"\n", + " + \"/\"\n", + " + \"deposit_amount\"\n", + " )\n", + " break\n", + "\n", + " if jsonDict[\"type\"][i] == \"table_item/transaction_withdrawal_date\":\n", + " for j in range(1, len(regionSplitterDict)):\n", + " if int(si) < list(regionSplitterDict.keys())[j]:\n", + " jsonDict[\"type\"][i] = (\n", + " regionSplitterDict[list(regionSplitterDict.keys())[j - 1]]\n", + " + \"_transaction\"\n", + " + \"/\"\n", + " + \"withdraw_date\"\n", + " )\n", + " break\n", + "\n", + " if jsonDict[\"type\"][i] == \"table_item/transaction_withdrawal_description\":\n", + " for j in range(1, len(regionSplitterDict)):\n", + " if int(si) < list(regionSplitterDict.keys())[j]:\n", + " jsonDict[\"type\"][i] = (\n", + " regionSplitterDict[list(regionSplitterDict.keys())[j - 1]]\n", + " + \"_transaction\"\n", + " + \"/\"\n", + " + \"withdraw_desc\"\n", + " )\n", + " break\n", + "\n", + " if jsonDict[\"type\"][i] == \"table_item/transaction_withdrawal\":\n", + " for j in range(1, len(regionSplitterDict)):\n", + " if int(si) < list(regionSplitterDict.keys())[j]:\n", + " jsonDict[\"type\"][i] = (\n", + " regionSplitterDict[list(regionSplitterDict.keys())[j - 1]]\n", + " + \"_transaction\"\n", + " + \"/\"\n", + " + \"withdraw_amount\"\n", + " )\n", + " break\n", + "\n", + " newEntitiesArray = documentai.Document()\n", + " for i in range(0, len(entitiesArray)):\n", + " newEntitiesArray.entities.append(\n", + " {\n", + " \"confidence\": jsonDict[\"confidence\"][i],\n", + " \"id\": jsonDict[\"id\"][i],\n", + " \"mention_text\": jsonDict[\"mention_text\"][i],\n", + " \"normalized_value\": jsonDict[\"normalized_value\"][i],\n", + " \"page_anchor\": jsonDict[\"page_anchor\"][i],\n", + " \"text_anchor\": jsonDict[\"text_anchor\"][i],\n", + " \"type\": jsonDict[\"type\"][i],\n", + " }\n", + " )\n", + "\n", + " newEntitiesArrayToIdDict = {}\n", + " for i in newEntitiesArray.entities:\n", + " newEntitiesArrayToIdDict[int(i.id)] = i\n", + "\n", + " allEntitiesNewArray = (\" \" * len(entityIdSchema)).split(\" \")\n", + " for i in entityIdSchema:\n", + " if len(entityIdSchema[i]) == 1:\n", + " allEntitiesNewArray[i] = newEntitiesArrayToIdDict[entityIdSchema[i][0]]\n", + " else:\n", + " tempA = []\n", + " for j in range(0, len(entityIdSchema[i])):\n", + " tempA.append(newEntitiesArrayToIdDict[entityIdSchema[i][j]])\n", + " allEntitiesNewArray[i] = allEntities[i]\n", + " allEntitiesNewArray[i].properties = tempA\n", + " allEntitiesNewArray = [x for x in allEntitiesNewArray if x]\n", + " for i in allEntitiesNewArray:\n", + " if i == \"\":\n", + " allEntitiesNewArray.remove(i)\n", + " if i.id:\n", + " if i.id == \"\":\n", + " del i.id\n", + " if i.normalized_value:\n", + " if i.normalized_value == \"\":\n", + " del i.normalized_value\n", + " if i.confidence:\n", + " if i.confidence == \"\":\n", + " del i.confidence\n", + " if i.page_anchor:\n", + " if i.page_anchor == \"\":\n", + " del i.page_anchor\n", + " if i.mention_text:\n", + " if i.mention_text == \"\":\n", + " del i.mention_text\n", + " if i.text_anchor:\n", + " if i.text_anchor == \"\":\n", + " del i.text_anchor\n", + " if i.properties:\n", + " for j in i.properties:\n", + " if j.normalized_value:\n", + " if j.normalized_value == \"\":\n", + " del j.normalized_value\n", + " if j.confidence:\n", + " if j.confidence == \"\":\n", + " del j.confidence\n", + " if j.page_anchor:\n", + " if j.page_anchor == \"\":\n", + " del j.page_anchor\n", + " if j.id:\n", + " if j.id == \"\":\n", + " del j.id\n", + " if j.mention_text:\n", + " if j.mention_text == \"\":\n", + 
" del j.mention_text\n", + " if j.text_anchor:\n", + " if j.text_anchor == \"\":\n", + " del j.text_anchor\n", + " for i in allEntitiesNewArray:\n", + " if i.type == \"table_item\":\n", + " account_prefix = i.properties[0].type.split(\"/\")[0]\n", + " i.type = account_prefix\n", + "\n", + " newJsonData = jsonData\n", + " newJsonData.entities = allEntitiesNewArray\n", + "\n", + " return newJsonData\n", + "\n", + "\n", + "def groupChecks(jsonData: documentai.Document, bankName: str) -> documentai.Document:\n", + " \"\"\"\n", + " Function to check for the bank name if they falls in top 3 banks(Wells Fargo, Bank of America, Chase),\n", + " if it found in the list entities will get sort.\n", + "\n", + " Parameters\n", + " ----------\n", + " jsonData : documentai.Document\n", + " The document proto data having the entities which needs to be change.\n", + " bankName :str\n", + " bank name which are present in document OCR and need to be check if it falls in top 3 banks.\n", + "\n", + " Returns\n", + " -------\n", + " documentai.Document\n", + " Returns the updated document proto object.\n", + " \"\"\"\n", + " # dictionary storing format of the table for top 3 banks\n", + " bankFormat = {\n", + " \"wellsfargo\": [\"check_number\", \"check_date\", \"check_amount\"],\n", + " \"bankofamerica\": [\"check_date\", \"check_number\", \"check_amount\"],\n", + " \"chase\": [\"check_number\", \"check_date\", \"check_amount\"],\n", + " }\n", + " bankChecksColumn = bankFormat[bankName]\n", + " allEntities = jsonData.entities\n", + " noOfEntitiesInJsonFile = len(allEntities)\n", + "\n", + " # Find entityIdSchema of Json\n", + " entityIdSchema = {}\n", + " for i in range(0, noOfEntitiesInJsonFile):\n", + " try:\n", + " if allEntities[i].id:\n", + " entityIdSchema[i] = [int(allEntities[i].id)]\n", + " except:\n", + " temp_arr = []\n", + " for j in allEntities[i].properties:\n", + " temp_arr.append(int(j.id))\n", + " entityIdSchema[i] = temp_arr\n", + "\n", + " # Single Level Entities file : jsonDict\n", + " jsonDict = documentai.Document.Entity()\n", + "\n", + " entitiesArray = []\n", + "\n", + " for i in range(0, noOfEntitiesInJsonFile):\n", + " try:\n", + " if allEntities[i].id:\n", + " entitiesArray.append(allEntities[i])\n", + " except:\n", + " for j in allEntities[i].properties:\n", + " entitiesArray.append(j)\n", + " # Sorting the entities using y-coordinates to order them according to the rows\n", + " entitiesArray = sorted(\n", + " entitiesArray,\n", + " key=lambda x: x.page_anchor.page_refs[0].bounding_poly.normalized_vertices[0].y,\n", + " )\n", + " newEntitiesArray = []\n", + " for i in entitiesArray:\n", + " print(\"-------------------\")\n", + " print(\"Parent : \", i.type, \" : \", i.mention_text)\n", + " # Sorting the properties of a single line item using x-coordinates to order them according to the table\n", + " if len(i.properties) > 0:\n", + " x2 = sorted(\n", + " i.properties,\n", + " key=lambda x: x.page_anchor.page_refs[0]\n", + " .bounding_poly.normalized_vertices[0]\n", + " .x,\n", + " )\n", + " j = 0\n", + " while j < len(x2):\n", + " k = 0\n", + " newEntity = documentai.Document.Entity() # Adding a new parentItem\n", + " newEntity.confidence = i.confidence\n", + " newEntity.mention_text = \"\"\n", + " xValues = []\n", + " yValues = []\n", + " properties = []\n", + " textSegments = []\n", + " while k < len(bankChecksColumn) and j < len(x2):\n", + " if x2[j].type == bankChecksColumn[k]:\n", + " newEntity.mention_text = (\n", + " newEntity.mention_text + x2[j].mention_text\n", + " )\n", + " 
for m in (\n", + " x2[j]\n", + " .page_anchor.page_refs[0]\n", + " .bounding_poly.normalized_vertices\n", + " ):\n", + " xValues.append(m.x)\n", + " yValues.append(m.y)\n", + " print(x2[j].type, \":\", x2[j].mention_text)\n", + " properties.append(x2[j])\n", + " textSegments.append(x2[j].text_anchor.text_segments[0])\n", + " else:\n", + " k += 1\n", + " continue\n", + " j += 1\n", + " if j < len(x2) and x2[j].type == bankChecksColumn[k]:\n", + " newEntity.mention_text = (\n", + " newEntity.mention_text + x2[j].mention_text\n", + " )\n", + " for m in (\n", + " x2[j]\n", + " .page_anchor.page_refs[0]\n", + " .bounding_poly.normalized_vertices\n", + " ):\n", + " xValues.append(m.x)\n", + " yValues.append(m.y)\n", + " print(x2[j].type, \":\", x2[j].mention_text)\n", + " properties.append(x2[j])\n", + " textSegments.append(x2[j].text_anchor.text_segments[0])\n", + " j += 1\n", + "\n", + " k += 1\n", + " # j+=1\n", + " # if len(xValues)>0:\n", + " xParentMax = max(xValues)\n", + " xParentMin = min(xValues)\n", + " yParentMax = max(yValues)\n", + " yParentMin = min(yValues)\n", + " if i.page_anchor.page_refs[0].page:\n", + " newEntity.page_anchor = {\n", + " \"page_refs\": [\n", + " {\n", + " \"bounding_poly\": {\n", + " \"normalized_vertices\": [\n", + " {\"x\": xParentMin, \"y\": yParentMin},\n", + " {\"x\": xParentMax, \"y\": yParentMin},\n", + " {\"x\": xParentMax, \"y\": yParentMax},\n", + " {\"x\": xParentMin, \"y\": yParentMax},\n", + " ]\n", + " },\n", + " \"page\": i.page_anchor.page_refs[0].page,\n", + " }\n", + " ]\n", + " }\n", + " else:\n", + " newEntity.page_anchor = {\n", + " \"page_refs\": [\n", + " {\n", + " \"bounding_poly\": {\n", + " \"normalized_vertices\": [\n", + " {\"x\": xParentMin, \"y\": yParentMin},\n", + " {\"x\": xParentMax, \"y\": yParentMin},\n", + " {\"x\": xParentMax, \"y\": yParentMax},\n", + " {\"x\": xParentMin, \"y\": yParentMax},\n", + " ]\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " newEntity.properties = properties\n", + " newEntity.text_anchor = {\"text_segments\": textSegments}\n", + " newEntity.type = i.type\n", + " print(\"*****************\")\n", + " print(newEntity)\n", + " newEntitiesArray.append(newEntity)\n", + " print(\"*****************\")\n", + " entitiesArray = newEntitiesArray\n", + " for e in entitiesArray:\n", + " if e.properties:\n", + " if len(e.properties) > 0:\n", + " for j in e.properties:\n", + " if j.id:\n", + " del j.id\n", + " for e in entitiesArray:\n", + " if e.properties:\n", + " if len(e.properties) > 0:\n", + " for j in e.properties:\n", + " if j.type:\n", + " print(e.type)\n", + " j.type = e.type + \"/\" + j.type\n", + " jsonData.entities = entitiesArray\n", + " return jsonData\n", + "\n", + "\n", + "def jsonToResultDf(jsonData: documentai.Document, jsonFileName: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Convert the document proto into a csv report with file name with the entity,id, confidence and text\n", + "\n", + " Parameters\n", + " ----------\n", + " jsonData : documentai.Document\n", + " The document proto with the updated data.\n", + " jsonFileName :str\n", + " Document file name.\n", + "\n", + " Returns\n", + " -------\n", + " pd.DataFrame\n", + " Returns the csv report with the required columns.\n", + " \"\"\"\n", + " import pandas as pd\n", + "\n", + " allEntities = jsonData.entities\n", + " noOfEntitiesInJsonFile = len(allEntities)\n", + "\n", + " def get_jsonDict_prop(allEntities, i):\n", + " jsonDict_prop = documentai.Document.Entity()\n", + " try:\n", + " jsonDict_prop.id = allEntities[i].id\n", + "\n", + 
" except:\n", + " jsonDict_prop.id = \"\"\n", + " try:\n", + " jsonDict_prop.type = allEntities[i].type\n", + " except:\n", + " jsonDict_prop.type = \"\"\n", + " try:\n", + " jsonDict_prop.confidence = allEntities[i].confidence\n", + " except:\n", + " jsonDict_prop.confidence = \"\"\n", + " try:\n", + " jsonDict_prop.mention_text = allEntities[i].mention_text\n", + " except:\n", + " jsonDict_prop.mention_text = \"\"\n", + " return jsonDict_prop\n", + "\n", + " # Single Level Entities file : jsonDict\n", + " jsonDict = {\n", + " \"File Name\": [],\n", + " \"ID\": [],\n", + " \"Entity Type\": [],\n", + " \"Confidence\": [],\n", + " \"Text\": [],\n", + " }\n", + "\n", + " entitiesArray = []\n", + "\n", + " for i in range(0, noOfEntitiesInJsonFile):\n", + " try:\n", + " if allEntities[i].id:\n", + " entitiesArray.append(allEntities[i])\n", + " except:\n", + " try:\n", + " if allEntities[i].properties:\n", + " json_dict_temp = get_jsonDict_prop(allEntities, i)\n", + "\n", + " entitiesArray.append(json_dict_temp)\n", + " for j in allEntities[i].properties:\n", + " entitiesArray.append(j)\n", + " except:\n", + " entitiesArray.append(\n", + " {\n", + " \"type\": allEntities[i].type,\n", + " \"mention_text\": allEntities[i].mention_text,\n", + " }\n", + " )\n", + "\n", + " for i in range(0, len(entitiesArray)):\n", + " jsonDict[\"File Name\"].append(jsonFileName)\n", + " try:\n", + " jsonDict[\"ID\"].append(entitiesArray[i].id)\n", + " except:\n", + " jsonDict[\"ID\"].append(\"\")\n", + " try:\n", + " jsonDict[\"Entity Type\"].append(entitiesArray[i].type)\n", + " except:\n", + " jsonDict[\"Entity Type\"].append(\"\")\n", + " try:\n", + " jsonDict[\"Confidence\"].append(entitiesArray[i].confidence)\n", + " except:\n", + " jsonDict[\"Confidence\"].append(None)\n", + " try:\n", + " jsonDict[\"Text\"].append(entitiesArray[i].mention_text)\n", + " except:\n", + " jsonDict[\"Text\"].append(\"\")\n", + "\n", + " df = pd.DataFrame(jsonDict)\n", + " return df\n", + "\n", + "\n", + "# Asynchronous processing of files using Bank statement parser provided\n", + "logger(\n", + " \"logging.txt\",\n", + " \"----------------------------------------LOGGING STARTED----------------------------------------\",\n", + ")\n", + "try:\n", + " temp_intital_pdfpath, temp_pdffolder, temp_pdfbucket = get_files_not_parsed(\n", + " gcs_input_dir, gcs_output_dir\n", + " )\n", + " logger(\"logging.txt\", \"Batch processing the documents.......\")\n", + " res = batch_process_documents_sample(\n", + " project_id, \"us\", processor_id, temp_intital_pdfpath, gcs_output_dir\n", + " )\n", + " logger(\"logging.txt\", \"Batch processing of documents done\")\n", + " delete_folder(temp_pdfbucket, temp_pdffolder)\n", + "except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Files are not processed because of error message--> {}\".format(e),\n", + " )\n", + " delete_folder(temp_pdfbucket, temp_pdffolder)\n", + " pass\n", + "# Getting bucket names and prefixes for further use\n", + "Input_bucket_name = gcs_input_dir.split(\"/\")[2]\n", + "prefix_input_files = \"/\".join(gcs_input_dir.split(\"/\")[3:])\n", + "Output_bucket_name = gcs_output_dir.split(\"/\")[2]\n", + "prefix_output_files = \"/\".join(gcs_output_dir.split(\"/\")[3:])\n", + "New_output_json_bucket = gcs_new_output_json_path.split(\"/\")[2]\n", + "New_prefix_output_jsons = \"/\".join(gcs_new_output_json_path.split(\"/\")[3:])\n", + "\n", + "\n", + "df3 = pd.DataFrame()\n", + "try:\n", + " temp_json_path, temp_json_folder, temp_json_bucket = 
get_files_not_postparsed(\n", + " gcs_output_dir, gcs_new_output_json_path\n", + " )\n", + " json_f, file_dict = file_names(temp_json_path)\n", + " json_files = list(file_dict.values())\n", + " logger(\"logging.txt\", \"list of json files prepared in the output folder\")\n", + " # delete_folder(temp_json_bucket,temp_json_folder)\n", + " try:\n", + " for i in range(len(json_f)):\n", + " try:\n", + " temp_json_gcs_path = temp_json_path + \"/\" + json_f[i]\n", + " temp_bucket_name = temp_json_gcs_path.split(\"/\")[2]\n", + " prefix_temp_file_name = \"/\".join(temp_json_gcs_path.split(\"/\")[3:])\n", + " document = documentai_json_proto_downloader(\n", + " temp_bucket_name, prefix_temp_file_name\n", + " )\n", + " logger(\n", + " \"logging.txt\",\n", + " \"loaded json file--||{}||-- from the output GCS folder\".format(\n", + " json_f[i]\n", + " ),\n", + " )\n", + "\n", + " try:\n", + " maxId = maxIdFinder(document)\n", + " logger(\"logging.txt\", \"Getting the Max id in the json data\")\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt get the Max id because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " try:\n", + " document = Boundary_markers(document)\n", + " logger(\"logging.txt\", \"account number, transactions are renamed\")\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt rename account related entities because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " try:\n", + " document = fixAccountBalance(document)\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Account balance (starting and ending balance ) is fixed\",\n", + " )\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt fix starting and ending balance because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " try:\n", + " document = accounttype_change(document)\n", + " logger(\"logging.txt\", \"account_type is changed to account_name\")\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt change account_type is changed to account_name because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " try:\n", + " document = borrowerNameFix(document)\n", + " logger(\n", + " \"logging.txt\",\n", + " \"entities- client_name is split into first_name, last_name and middle_name if available\",\n", + " )\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt split the client_name because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " try:\n", + " for key in dict_ent_rename.keys():\n", + " document = ent_rename(document, key, dict_ent_rename[key])\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Some entities are renamed as per gatless names given\",\n", + " )\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt rename some entities because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " try:\n", + " document = address_function(document)\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Client address is split into street name,zip code and city\",\n", + " )\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt split the Client address because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " try:\n", + " 
document = add_total_pages(document) # TODO : here will start\n", + " logger(\"logging.txt\", \"Adding total pages entity into json\")\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt radd total pages entity into pages because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " pass\n", + " try:\n", + " if checksFlag == True:\n", + " Financial_Institution = \"\"\n", + " for e in document.entities:\n", + " if e.type == \"Financial_Institution\":\n", + " Financial_Institution = e.mention_text\n", + " break\n", + " file_path = gcs_input_dir + \"/\" + json_f[i][:-7] + \".pdf\"\n", + " file_bucket_name = file_path.split(\"/\")[2]\n", + " prefex_file_path = \"/\".join(file_path.split(\"/\")[3:])\n", + " Financial_Institution = \"\".join(\n", + " Financial_Institution.strip().split()\n", + " ).lower()\n", + " if Financial_Institution in [\n", + " \"wellsfargo\",\n", + " \"chase\",\n", + " \"bankofamerica\",\n", + " ]:\n", + " storage_client = storage.Client()\n", + " bucket = storage_client.bucket(file_bucket_name)\n", + " blob = bucket.blob(prefex_file_path)\n", + " pdf_bytes = blob.download_as_string()\n", + " check_json = process_document_sample(\n", + " project_id,\n", + " \"us\",\n", + " processor_id_checks,\n", + " file_path,\n", + " pdf_bytes,\n", + " processor_version_checks,\n", + " )\n", + " check_json_output = groupChecks(\n", + " check_json.document, Financial_Institution\n", + " )\n", + " combined_entities = (\n", + " document.entities + check_json_output.entities\n", + " )\n", + " document.entities = combined_entities\n", + " # fs.pipe(gcs_new_output_checks_json_path+json_f[i].split('/')[-1],bytes(json.dumps(check_json_output,ensure_ascii=False),'utf-8'),content_type='application/json')\n", + " logger(\"logging.txt\", \"Checks Details are added \")\n", + " else:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Financial Institution is out of Scope--> {}\".format(\n", + " Financial_Institution\n", + " ),\n", + " )\n", + " pass\n", + " else:\n", + " pass\n", + " except:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt find Checks Detail due to error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " try:\n", + " final_files_list, final_files_dict = file_names(\n", + " gcs_new_output_json_path\n", + " )\n", + " store_document_as_json(\n", + " documentai.Document.to_json(document),\n", + " New_output_json_bucket,\n", + " New_prefix_output_jsons + final_files_list[i],\n", + " )\n", + " logger(\n", + " \"logging.txt\",\n", + " \"post processed json files are moved to gcs postprocessed folder provided\",\n", + " )\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt upload the post processed json file because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " continue\n", + " except Exception as e:\n", + " print(e)\n", + " continue\n", + " except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt load json file because of error message--> {}\".format(e),\n", + " )\n", + " pass\n", + "except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt get list of json files because of error message--> {}\".format(e),\n", + " )\n", + " delete_folder(temp_json_bucket, temp_json_folder)\n", + " pass\n", + "\n", + "# changing meta data of post processed files\n", + "\n", + "try:\n", + " !gsutil -m setmeta -h \"content-Type:application/json\" {gcs_new_output_json_path}*\n", + " logger(\n", + " 
\"logging.txt\",\n", + " \"meta data for post processed json files changed to application/json \",\n", + " )\n", + "except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt update meta data for post processed json files changed to application/json because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + "\n", + "delete_folder(temp_json_bucket, temp_json_folder)\n", + "# creating data frame and saving data into csv\n", + "try:\n", + " logger(\"logging.txt\", \"creating dataframe to create consolidated csv\")\n", + " final_files_list, final_files_dict = file_names(gcs_new_output_json_path)\n", + " for i in range(len(final_files_list)):\n", + " json_2 = documentai_json_proto_downloader(\n", + " New_output_json_bucket, New_prefix_output_jsons + final_files_list[i]\n", + " )\n", + " df1 = jsonToResultDf(json_2, final_files_list[i])\n", + " df2 = df1\n", + " df3 = pd.concat([df3, df2], ignore_index=True)\n", + "except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"failed to create dataframe to create consolidated csv because of error--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + " print(e)\n", + "\n", + "try:\n", + " df3.to_csv(\"Consolidated.csv\")\n", + " logger(\"logging.txt\", \"Consolidated CSV file created\")\n", + "except Exception as e:\n", + " logger(\n", + " \"logging.txt\",\n", + " \"Couldnt create consolidated CSV file because of error message--> {}\".format(\n", + " e\n", + " ),\n", + " )\n", + "\n", + "\n", + "logger(\n", + " \"logging.txt\",\n", + " \"----------------------------------------END OF POST PROCESSING----------------------------------------\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6fa3e445-337d-4500-9daf-988be7ffc098", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f5318c8-59be-46ba-898f-8088c5a980d5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image1.png b/incubator-tools/bank_statement_post_processing_tool/images/image1.png new file mode 100644 index 000000000..f774cf60c Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image1.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image10.png b/incubator-tools/bank_statement_post_processing_tool/images/image10.png new file mode 100644 index 000000000..45442d40d Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image10.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image2.png b/incubator-tools/bank_statement_post_processing_tool/images/image2.png new file mode 100644 index 000000000..6619154cd Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image2.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image3.png 
b/incubator-tools/bank_statement_post_processing_tool/images/image3.png new file mode 100644 index 000000000..840540267 Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image3.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image4.png b/incubator-tools/bank_statement_post_processing_tool/images/image4.png new file mode 100644 index 000000000..5079b5efa Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image4.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image5.png b/incubator-tools/bank_statement_post_processing_tool/images/image5.png new file mode 100644 index 000000000..c53718044 Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image5.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image6.png b/incubator-tools/bank_statement_post_processing_tool/images/image6.png new file mode 100644 index 000000000..edb48175f Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image6.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image7.png b/incubator-tools/bank_statement_post_processing_tool/images/image7.png new file mode 100644 index 000000000..f27257532 Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image7.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image8.png b/incubator-tools/bank_statement_post_processing_tool/images/image8.png new file mode 100644 index 000000000..42ab3f3d9 Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image8.png differ diff --git a/incubator-tools/bank_statement_post_processing_tool/images/image9.png b/incubator-tools/bank_statement_post_processing_tool/images/image9.png new file mode 100644 index 000000000..a8897be3e Binary files /dev/null and b/incubator-tools/bank_statement_post_processing_tool/images/image9.png differ diff --git a/incubator-tools/bank_statements_line_items_improver_and_missing_items_finder/README.md b/incubator-tools/bank_statements_line_items_improver_and_missing_items_finder/README.md new file mode 100644 index 000000000..b46e26bfd --- /dev/null +++ b/incubator-tools/bank_statements_line_items_improver_and_missing_items_finder/README.md @@ -0,0 +1,15 @@ +# Purpose and Description + +The objective of the tool is to find the missing child items and group the correct child items into parent line items. + +## Input Details + +* **Gcs_input_path** : GCS Input Path. It should contain DocAI-processed output JSON files. +* **Gcs_output_path** : GCS Output Path. The updated JSONs will be saved in the output path. +* **project_id** : It should contain the project ID of your current project. 
+* **parent_type** : Specify the parent entity type, such as table_item or line_item. +* **Missing_items_flag**: "True" if the missing child items need to be found; the missing-items step will be skipped if this value is anything other than True. + +## Output Details + +The missing fields will be detected from the existing line items and grouped, and the updated JSON is saved in the output path. diff --git a/incubator-tools/bank_statements_line_items_improver_and_missing_items_finder/bank_statements_line_items_improver_and_missing_items_finder.ipynb b/incubator-tools/bank_statements_line_items_improver_and_missing_items_finder/bank_statements_line_items_improver_and_missing_items_finder.ipynb new file mode 100644 index 000000000..b834821dd --- /dev/null +++ b/incubator-tools/bank_statements_line_items_improver_and_missing_items_finder/bank_statements_line_items_improver_and_missing_items_finder.ipynb @@ -0,0 +1,1149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6589fc93-39d1-4d10-be1f-e7eb33fe4087", + "metadata": {}, + "source": [ + "# Bank Statements Line Items Improver and Missing Items Finder\n" + ] + }, + { + "cell_type": "markdown", + "id": "5bf22bf7-4a47-4f3a-9eef-6f19348a5250", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "361f188e-fe11-4a49-b7c8-080e0e69ce7a", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied. \n" + ] + }, + { + "cell_type": "markdown", + "id": "1036937a-0221-48eb-862e-3fa0b8e646a8", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "The objective of the tool is to find the missing child items and group the correct child items into parent line items." + ] + }, + { + "cell_type": "markdown", + "id": "115a4e82-5e83-468a-b0e5-097ca14f15d5", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "* Vertex AI Notebook or Colab (if using Colab, use authentication)\n", + "* Storage bucket for storing input and output JSON files\n", + "* Permissions for Google Storage and Vertex AI Notebook.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "fe81de40-5c62-4c0b-adea-937f957b1a6e", + "metadata": {}, + "source": [ + "## Step by Step Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "142123d3-37b1-4aa8-841c-40c3bd52d70c", + "metadata": {}, + "source": [ + "### 1. Importing Required Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0643c5f9-29fe-4252-9e6c-e2afc8c2f2b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install pandas numpy google-cloud-storage google-cloud-documentai==2.16.0\n", + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7588c13e-0e09-4a76-8c21-85a68ee262c6", + "metadata": {}, + "outputs": [], + "source": [ + "# import libraries\n", + "from tqdm import tqdm\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "from pathlib import Path\n", + "from google.cloud import storage\n", + "from collections import Counter\n", + "from typing import Dict, List, Any, Tuple\n", + "from utilities import *" + ] + }, + { + "cell_type": "markdown", + "id": "fd7c8c4c-68b8-413c-b4bc-c66f044d3b7a", + "metadata": {}, + "source": [ + "### 2. 
Input and Output Paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47eb2160-f5b4-42da-acdb-1ba4babc2ec0", + "metadata": {}, + "outputs": [], + "source": [ + "# Path to the raw parsed JSON files. The path must end with a forward slash ('/').\n", + "Gcs_input_path = \"gs://xxxxx/xxxxxxxxxxxx/xx/\"\n", + "# Your Google Cloud project ID.\n", + "project_id = \"xxx-xxxx-xxxx\"\n", + "# Path for saving the processed output files. Do not include a trailing forward slash ('/').\n", + "Gcs_output_path = \"gs://xxxxx/xxxxxxxxxxxx/xx\"\n", + "parent_type = \"table_item\"\n", + "Missing_items_flag = \"True\" # case sensitive" + ] + }, + { + "cell_type": "markdown", + "id": "da0d4909-e00c-4704-a43b-6534f7403872", + "metadata": {}, + "source": [ + "* ``Gcs_input_path ``: GCS Input Path. It should contain DocAI processed output json files. \n", + "* ``Gcs_output_path ``: GCS Output Path. The updated jsons will be saved in output path. \n", + "* ``project_id`` : It should contains the project id of your current project.\n", + "* ``parent_type`` : Specify the parent entity type like table_item, line_item \n", + "* ``Missing_items_flag``: \"True\" if we need to find the missing child items , missing items step will be skipped if this value is other than True\n" + ] + }, + { + "cell_type": "markdown", + "id": "737d1c70-fef5-49e3-a266-695bf8076a54", + "metadata": {}, + "source": [ + "### 3. Run the Code" + ] + }, + { + "cell_type": "markdown", + "id": "6afc3e35-e12a-40c6-9f81-10460a9a9421", + "metadata": {}, + "source": [ + "### Note\n", + "* While using the missing items code , if the line items are closer then `modify the get_token_data function by increasing or decreasing the x and y allowances`.\n", + "* Human review is recomended after this tool usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c22bfdd-abdc-4d1c-8f7c-86164e7c4103", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def get_page_bbox(entity: documentai.Document.Entity):\n", + " \"\"\"\n", + " Get the bounding box (bbox) coordinates of a page entity.\n", + "\n", + " Args:\n", + " - entity : Document AI entity object.\n", + "\n", + " Returns:\n", + " - List[float]: A list containing four float values representing the coordinates of the bounding box.\n", + " The format is [min_x, min_y, max_x, max_y].\n", + " \"\"\"\n", + " bound_poly = entity.page_anchor.page_refs\n", + " norm_ver = bound_poly[0].bounding_poly.normalized_vertices\n", + " x_values = [vertex.x for vertex in norm_ver]\n", + " y_values = [vertex.y for vertex in norm_ver]\n", + " bbox = [min(x_values), min(y_values), max(x_values), max(y_values)]\n", + "\n", + " return bbox\n", + "\n", + "\n", + "def get_page_wise_entities(json_dict: documentai.Document):\n", + " \"\"\"\n", + " Extracts entities from a loaded JSON file and organizes them based on the page they belong to.\n", + "\n", + " Args:\n", + " - json_dict : documentai object.\n", + "\n", + " Returns:\n", + " - Dict[int, List[Dict[str, Any]]]: A dictionary where keys represent page numbers and values are lists\n", + " of entities belonging to that page.\n", + " \"\"\"\n", + "\n", + " entities_page = {}\n", + " for entity in json_dict.entities:\n", + " page = entity.page_anchor.page_refs[0].page\n", + " if page in entities_page.keys():\n", + " entities_page[page].append(entity)\n", + " else:\n", + " entities_page[page] = [entity]\n", + "\n", + " return entities_page\n", + "\n", + "\n", + "def get_line_items_schema(line_items: Any):\n", + " \"\"\"\n", 
+ " Generate a schema for line items along with their corresponding positions on the page.\n", + "\n", + " Args:\n", + " - line_items (List[Dict[str, Any]]): A list of line items extracted from the JSON entities.\n", + "\n", + " Returns:\n", + " - Tuple[Dict[str, int], Dict[str, List[List[float]]], Dict[str, List[List[float]]]]: A tuple containing:\n", + " 1. A dictionary representing the schema for line items with the count of each type.\n", + " 2. A dictionary representing the x positions of line items for each type.\n", + " 3. A dictionary representing the y positions of line items for each type.\n", + " \"\"\"\n", + " # line_items = [entity for entity in json_dict.entities if entity.properties]\n", + " line_item_schema = []\n", + " schema_xy = []\n", + " for line_item in line_items:\n", + " temp_schema = {}\n", + " temp_xy = {}\n", + " for item in line_item.properties:\n", + " temp_schema[item.type] = temp_schema.get(item.type, 0) + 1\n", + " bbox = get_page_bbox(item)\n", + " if item.type in temp_xy:\n", + " temp_xy[item.type].append(bbox)\n", + " else:\n", + " temp_xy[item.type] = [bbox]\n", + "\n", + " line_item_schema.append(temp_schema)\n", + " schema_xy.append(temp_xy)\n", + "\n", + " flat_list = [\n", + " (key, value) for item in line_item_schema for key, value in item.items()\n", + " ]\n", + "\n", + " counter = Counter(dict(flat_list))\n", + " temp_schema_dict = dict(counter)\n", + " consolidated_positions_ent = {}\n", + " x = []\n", + " for k3, v3 in temp_schema_dict.items():\n", + " for l3 in schema_xy:\n", + " for k4, v4 in l3.items():\n", + " if k3 == k4:\n", + " for x12 in v4:\n", + " if k3 in consolidated_positions_ent.keys():\n", + " consolidated_positions_ent[k3].append(x12)\n", + " else:\n", + " consolidated_positions_ent[k3] = [x12]\n", + " final_ent_x12 = {}\n", + " final_ent_y12 = {}\n", + " for ent_typ, va1 in consolidated_positions_ent.items():\n", + " sorted_data = sorted(va1, key=lambda x: x[0])\n", + " groups = []\n", + " current_group = [sorted_data[0]]\n", + " difference_threshold = 0.02\n", + " for i in range(1, len(sorted_data)):\n", + " if abs(sorted_data[i][0] - current_group[-1][0]) <= difference_threshold:\n", + " current_group.append(sorted_data[i])\n", + " else:\n", + " groups.append(current_group)\n", + " current_group = [sorted_data[i]]\n", + " groups.append(current_group)\n", + " for va3 in groups:\n", + " if len(va3) >= 1:\n", + " if ent_typ in final_ent_x12.keys():\n", + " final_ent_x12[ent_typ].append(\n", + " [min(item[0] for item in va3), max(item[2] for item in va3)]\n", + " )\n", + " final_ent_y12[ent_typ].append(\n", + " [min(item[1] for item in va3), max(item[3] for item in va3)]\n", + " )\n", + " else:\n", + " final_ent_x12[ent_typ] = [\n", + " [min(item[0] for item in va3), max(item[2] for item in va3)]\n", + " ]\n", + " final_ent_y12[ent_typ] = [\n", + " [min(item[1] for item in va3), max(item[3] for item in va3)]\n", + " ]\n", + "\n", + " return temp_schema_dict, final_ent_x12, final_ent_y12\n", + "\n", + "\n", + "def get_token_xy(token: Any) -> Tuple[float, float, float, float]:\n", + " \"\"\"\n", + " Extracts the normalized bounding box coordinates (min_x, min_y, max_x, max_y) of a token.\n", + "\n", + " Args:\n", + " - token (Any): A token object with layout information.\n", + "\n", + " Returns:\n", + " - Tuple[float, float, float, float]: The normalized bounding box coordinates.\n", + "\n", + " \"\"\"\n", + " vertices = token.layout.bounding_poly.normalized_vertices\n", + " minx_token, miny_token = min(point.x for point in 
vertices), min(\n", + " point.y for point in vertices\n", + " )\n", + " maxx_token, maxy_token = max(point.x for point in vertices), max(\n", + " point.y for point in vertices\n", + " )\n", + "\n", + " return minx_token, miny_token, maxx_token, maxy_token\n", + "\n", + "\n", + "def get_token_data(\n", + " json_dict: documentai.Document,\n", + " min_x: float,\n", + " max_x: float,\n", + " min_y: float,\n", + " max_y: float,\n", + " page_num: int,\n", + "):\n", + " \"\"\"\n", + " Extracts token data from the JSON dictionary based on provided bounding box coordinates and page number.\n", + "\n", + " Args:\n", + " - json_dict (Dict[str, Any]): The JSON dictionary containing token data.\n", + " - min_x (float): Minimum x-coordinate of the bounding box.\n", + " - max_x (float): Maximum x-coordinate of the bounding box.\n", + " - min_y (float): Minimum y-coordinate of the bounding box.\n", + " - max_y (float): Maximum y-coordinate of the bounding box.\n", + " - page_num (int): Page number.\n", + "\n", + " Returns:\n", + " - Tuple[str, List[Dict[str, Any]], List[Dict[str, float]]]: A tuple containing:\n", + " 1. The extracted text from the tokens.\n", + " 2. A list of dictionaries containing text anchor data for each token.\n", + " 3. A list of dictionaries containing page anchor data.\n", + " \"\"\"\n", + " text_anc_temp = []\n", + " text_anc = []\n", + " page_anc_temp = {\"x\": [], \"y\": []}\n", + " y_allowance = (\n", + " 0.01 # edit this if the line items are closer and your not getitng desir\n", + " )\n", + " x_allowance = 0.02\n", + " for page in json_dict.pages:\n", + " if page_num == page.page_number - 1:\n", + " for token in page.tokens:\n", + " minx_token, miny_token, maxx_token, maxy_token = get_token_xy(token)\n", + " if (\n", + " min_y <= miny_token + y_allowance\n", + " and max_y >= maxy_token - y_allowance\n", + " and min_x <= minx_token + x_allowance\n", + " and max_x >= maxx_token - x_allowance\n", + " ):\n", + " temp_anc = token.layout.text_anchor.text_segments[0]\n", + " text_anc.append(temp_anc)\n", + " page_anc_temp[\"x\"].extend([minx_token, maxx_token])\n", + " page_anc_temp[\"y\"].extend([miny_token, maxy_token])\n", + " for seg in token.layout.text_anchor.text_segments:\n", + " text_anc_temp.append([seg.start_index, seg.end_index])\n", + " if page_anc_temp != {\"x\": [], \"y\": []}:\n", + " page_anc = [\n", + " {\"x\": min(page_anc_temp[\"x\"]), \"y\": min(page_anc_temp[\"y\"])},\n", + " {\"x\": max(page_anc_temp[\"x\"]), \"y\": min(page_anc_temp[\"y\"])},\n", + " {\"x\": min(page_anc_temp[\"x\"]), \"y\": max(page_anc_temp[\"y\"])},\n", + " {\"x\": max(page_anc_temp[\"x\"]), \"y\": max(page_anc_temp[\"y\"])},\n", + " ]\n", + " if text_anc_temp != []:\n", + " sorted_data = sorted(text_anc_temp, key=lambda x: x[0])\n", + " mention_text = \"\"\n", + " for start_index, end_index in sorted_data:\n", + " mention_text += json_dict.text[start_index:end_index]\n", + "\n", + " return mention_text, text_anc, page_anc\n", + "\n", + "\n", + "def get_missing_fields(\n", + " json_dict,\n", + " line_items,\n", + " temp_schema_dict,\n", + " final_ent_x12,\n", + " ent_x_region,\n", + " line_item_y_region,\n", + "):\n", + " \"\"\"\n", + " Identifies missing fields in line items and fills them based on provided criteria.\n", + "\n", + " Args:\n", + " - json_dict (Dict[str, Any]): The JSON dictionary containing relevant data.\n", + " - line_items (List[Any]): The list of line items to be processed.\n", + " - temp_schema_dict (Dict[str, int]): The schema representing the count of each 
type.\n", + " - final_ent_x12 (Dict[str, List[List[float]]]): The x positions of line items for each type.\n", + " - ent_x_region (Dict[str, List[float]]): The x positions of entities for each type.\n", + " - line_item_y_region (List[float]): The y positions of line items.\n", + "\n", + " Returns:\n", + " - List[Any]: The updated list of line items.\n", + " \"\"\"\n", + " for line_item in line_items:\n", + " import copy\n", + "\n", + " page_num = 0\n", + " temp_types = []\n", + " mis_type = []\n", + " text_anc_line = []\n", + " text_anc_mt = []\n", + " page_anc_line = {\"x\": [], \"y\": []}\n", + " deep_copy_temp_schema = copy.deepcopy(temp_schema_dict)\n", + "\n", + " for child in line_item.properties:\n", + " temp_types.append(child.type)\n", + " for seg in child.text_anchor.text_segments:\n", + " text_anc_line.append(seg)\n", + " text_anc_mt.append([seg.start_index, seg.end_index])\n", + " for anc4 in child.page_anchor.page_refs:\n", + " # page_n=anc4.page\n", + " for xy2 in anc4.bounding_poly.normalized_vertices:\n", + " page_anc_line[\"x\"].append(xy2.x)\n", + " page_anc_line[\"y\"].append(xy2.y)\n", + " # only for bank statement parser output\n", + " for k2 in temp_types:\n", + " if \"deposit\" in k2:\n", + " modified_schema = {\n", + " key: value\n", + " for key, value in deep_copy_temp_schema.items()\n", + " if \"withdrawal\" not in key\n", + " }\n", + " break\n", + " elif \"withdrawal\" in k2:\n", + " modified_schema = {\n", + " key: value\n", + " for key, value in deep_copy_temp_schema.items()\n", + " if \"deposit\" not in key\n", + " }\n", + " break\n", + " if \"modified_schema\" not in locals():\n", + " modified_schema = deep_copy_temp_schema\n", + "\n", + " for t1, v1 in modified_schema.items():\n", + " if t1 in temp_types:\n", + " pass\n", + " else:\n", + " mis_type.append(t1)\n", + "\n", + " if len(mis_type) > 0:\n", + " for typ in mis_type:\n", + " for ent_pos in line_item.page_anchor.page_refs:\n", + " page_num = ent_pos.page\n", + " try:\n", + " min_x = ent_x_region[typ][0]\n", + " except:\n", + " min_x = min(\n", + " ver.x for ver in ent_pos.bounding_poly.normalized_vertices\n", + " )\n", + " min_y = min(\n", + " ver.y for ver in ent_pos.bounding_poly.normalized_vertices\n", + " )\n", + " try:\n", + " max_x = ent_x_region[typ][1] - 0.02\n", + " except:\n", + " max_x = (\n", + " max(\n", + " ver.x\n", + " for ver in ent_pos.bounding_poly.normalized_vertices\n", + " )\n", + " - 0.02\n", + " )\n", + "\n", + " if \"description\" in typ:\n", + " try:\n", + " closest_index_y = min(\n", + " range(len(line_item_y_region)),\n", + " key=lambda i: abs(line_item_y_region[i] - min_y),\n", + " )\n", + "\n", + " max_y = line_item_y_region[closest_index_y + 1]\n", + " except:\n", + " pass\n", + " else:\n", + " max_y = max(\n", + " ver.y for ver in ent_pos.bounding_poly.normalized_vertices\n", + " )\n", + "\n", + " try:\n", + " mention_text, text_anc, page_anc = get_token_data(\n", + " json_dict, min_x, max_x, min_y, max_y, page_num\n", + " )\n", + " for an3 in text_anc:\n", + " text_anc_line.append(an3)\n", + " text_anc_mt.append([an3.start_index, an3.end_index])\n", + " for xy3 in page_anc:\n", + " page_anc_line[\"x\"].append(xy3[\"x\"])\n", + " page_anc_line[\"y\"].append(xy3[\"y\"])\n", + " entity_new = {\n", + " \"mention_text\": mention_text,\n", + " \"page_anchor\": {\n", + " \"page_refs\": [\n", + " {\n", + " \"bounding_poly\": {\n", + " \"normalized_vertices\": page_anc\n", + " },\n", + " \"page\": str(page_num),\n", + " }\n", + " ]\n", + " },\n", + " \"text_anchor\": {\n", + " 
\"content\": mention_text,\n", + " \"text_segments\": text_anc,\n", + " },\n", + " \"type\": typ,\n", + " }\n", + " line_item.properties.append(entity_new)\n", + " # print(typ)\n", + " # print(mention_text)\n", + " except Exception as e:\n", + " pass\n", + " page_anc_final = [\n", + " {\"x\": min(page_anc_line[\"x\"]), \"y\": min(page_anc_line[\"y\"])},\n", + " {\"x\": max(page_anc_line[\"x\"]), \"y\": min(page_anc_line[\"y\"])},\n", + " {\"x\": min(page_anc_line[\"x\"]), \"y\": max(page_anc_line[\"y\"])},\n", + " {\"x\": max(page_anc_line[\"x\"]), \"y\": max(page_anc_line[\"y\"])},\n", + " ]\n", + " sorted_data_1 = sorted(text_anc_mt, key=lambda x: x[0])\n", + " mention_text_final = \"\"\n", + " for start_index_1, end_index_1 in sorted_data_1:\n", + " mention_text_final = (\n", + " mention_text_final + \" \" + json_dict.text[start_index_1:end_index_1]\n", + " )\n", + "\n", + " line_item.mention_text = mention_text_final\n", + " for anc6 in line_item.page_anchor.page_refs:\n", + " anc6.bounding_poly.normalized_vertices = page_anc_final\n", + " line_item.text_anchor.text_segments = text_anc_line\n", + "\n", + " new_ent = []\n", + "\n", + " for l1 in line_items:\n", + " new_ent.append(l1)\n", + "\n", + " return new_ent\n", + "\n", + "\n", + "def get_schema_with_bbox(line_items: List[Dict[str, Any]]):\n", + " \"\"\"\n", + " Generates a schema for line items along with their bounding box coordinates.\n", + "\n", + " Args:\n", + " - line_items (List[Dict[str, Any]]): A list of line items.\n", + "\n", + " Returns:\n", + " - Tuple[List[Dict[str, int]], List[Dict[str, List[List[float]]]]]: A tuple containing:\n", + " 1. A list of dictionaries representing the schema for line items with the count of each type.\n", + " 2. A list of dictionaries representing the bounding box coordinates of line items for each type.\n", + " \"\"\"\n", + " line_item_schema = []\n", + " schema_xy = []\n", + " for line_item in line_items:\n", + " temp_schema = {}\n", + " temp_xy = {}\n", + " for item in line_item.properties:\n", + " temp_schema[item.type] = temp_schema.get(item.type, 0) + 1\n", + " bbox = get_page_bbox(item)\n", + " if item.type in temp_xy:\n", + " temp_xy[item.type].append(bbox)\n", + " else:\n", + " temp_xy[item.type] = [bbox]\n", + "\n", + " line_item_schema.append(temp_schema)\n", + " schema_xy.append(temp_xy)\n", + "\n", + " return line_item_schema, schema_xy\n", + "\n", + "\n", + "def get_anchor_entity(\n", + " schema_xy: List[Dict[str, List[List[float]]]],\n", + " line_item_schema: List[Dict[str, int]],\n", + ") -> str:\n", + " \"\"\"\n", + " Identifies the anchor entity among the entities based on certain criteria.\n", + "\n", + " Args:\n", + " - schema_xy (List[Dict[str, List[List[float]]]]): A list of dictionaries representing the bounding box coordinates\n", + " of line items for each type.\n", + " - line_item_schema (List[Dict[str, int]]): A list of dictionaries representing the schema for line items with\n", + " the count of each type.\n", + "\n", + " Returns:\n", + " - str: The anchor entity.\n", + " \"\"\"\n", + " ent_y2 = {}\n", + " for sc1 in schema_xy:\n", + " for e2, bbox in sc1.items():\n", + " if len(bbox) == 1:\n", + " for b2 in bbox:\n", + " ent_y2.setdefault(e2, []).extend([b2[1], b2[3]])\n", + " # get the min and max y of entities\n", + " entity_min_max_y = {}\n", + " for en3, val3 in ent_y2.items():\n", + " min_y_3 = min(val3)\n", + " max_y_3 = max(val3)\n", + " entity_min_max_y[en3] = [min_y_3, max_y_3]\n", + "\n", + " # counting times the entity appeared uniquely in all the 
line items\n", + " entity_count = {}\n", + " for entry in line_item_schema:\n", + " for entity, value in entry.items():\n", + " if value == 1:\n", + " if entity in entity_count:\n", + " entity_count[entity] += 1\n", + " else:\n", + " entity_count[entity] = 1\n", + "\n", + " value_counts = {}\n", + " for value in entity_count.values():\n", + " value_counts[value] = value_counts.get(value, 0) + 1\n", + " # Find the maximum value\n", + " max_value = max(value_counts.values())\n", + "\n", + " # Find keys with the maximum value\n", + " keys_with_max_value = [\n", + " key for key, value in value_counts.items() if value == max_value\n", + " ]\n", + "\n", + " # Find the key with the maximum value (in case of ties, choose the maximum key)\n", + " max_key = max(keys_with_max_value)\n", + "\n", + " repeated_key = [key for key, value in entity_count.items() if value == max_key]\n", + "\n", + " filtered_entities = {\n", + " key: entity_min_max_y[key] for key in repeated_key if key in entity_min_max_y\n", + " }\n", + "\n", + " if len(filtered_entities) > 1:\n", + " anchor_entity = min(filtered_entities, key=lambda k: filtered_entities[k][0])\n", + " else:\n", + " anchor_entity = list(filtered_entities.keys())[0]\n", + "\n", + " return anchor_entity\n", + "\n", + "\n", + "def entity_region_x(\n", + " schema_xy: List[Dict[str, List[List[float]]]]\n", + ") -> Dict[str, List[float]]:\n", + " \"\"\"\n", + " Calculates the x-regions for different types of entities based on their bounding boxes.\n", + "\n", + " Args:\n", + " - schema_xy (List[Dict[str, List[List[float]]]]): A list of dictionaries representing the bounding box coordinates\n", + " of line items for each type.\n", + "\n", + " Returns:\n", + " - Dict[str, List[float]]: A dictionary containing the x-regions for different types of entities.\n", + " \"\"\"\n", + "\n", + " def get_margin(min_y_bin: List[float], min_values: str = \"YES\") -> float:\n", + " \"\"\"\n", + " Computes the margin based on the minimum y-bin values.\n", + "\n", + " Args:\n", + " - min_y_bin (List[float]): List of minimum y-bin values.\n", + " - min_values (str): A flag indicating whether to compute the minimum value.\n", + "\n", + " Returns:\n", + " - float: The computed margin.\n", + " \"\"\"\n", + " # Sort the list in ascending order\n", + " min_y_bin.sort()\n", + "\n", + " bins = []\n", + " current_bin = [min_y_bin[0]]\n", + " # Iterate through the values to create bins\n", + " for i in range(1, len(min_y_bin)):\n", + " if min_y_bin[i] - current_bin[-1] < 0.05:\n", + " current_bin.append(min_y_bin[i])\n", + " else:\n", + " bins.append(current_bin.copy())\n", + " current_bin = [min_y_bin[i]]\n", + "\n", + " # Add the last bin\n", + " bins.append(current_bin)\n", + " final_bins = []\n", + " for bin_1 in bins:\n", + " if len(bin_1) >= 2:\n", + " final_bins.append(bin_1)\n", + " if final_bins == []:\n", + " for bin_1 in bins:\n", + " if len(bin_1) >= 1:\n", + " final_bins.append(bin_1)\n", + " if min_values == \"YES\":\n", + " return min(min(inner_list) for inner_list in final_bins)\n", + " else:\n", + " return max(max(inner_list) for inner_list in final_bins)\n", + "\n", + " ent_full_boundries = {}\n", + " for line_1 in schema_xy:\n", + " for typ_1, bbox_1 in line_1.items():\n", + " if len(bbox_1) == 1:\n", + " if typ_1 in ent_full_boundries.keys():\n", + " ent_full_boundries[typ_1].append(bbox_1[0])\n", + " else:\n", + " ent_full_boundries[typ_1] = bbox_1\n", + " ent_margins = {}\n", + " for ent_typ_1, values_1 in ent_full_boundries.items():\n", + " min_x_bin = []\n", + " 
min_y_bin = []\n", + " max_x_bin = []\n", + " max_y_bin = []\n", + " min_check = len(values_1)\n", + " for bbox in values_1:\n", + " min_x_bin.append(bbox[0])\n", + " min_y_bin.append(bbox[1])\n", + " max_x_bin.append(bbox[2])\n", + " max_y_bin.append(bbox[3])\n", + " min_x = get_margin(min_x_bin, min_values=\"YES\")\n", + " min_y = get_margin(min_y_bin, min_values=\"YES\")\n", + " max_x = get_margin(max_x_bin, min_values=\"NO\")\n", + " max_y = get_margin(max_y_bin, min_values=\"NO\")\n", + "\n", + " ent_margins[ent_typ_1] = [min_x, min_y, max_x, max_y]\n", + "\n", + " ent_margin_withdrawal = {}\n", + " ent_margin_deposit = {}\n", + " for ent_3, bbox_3 in ent_margins.items():\n", + " if \"withdrawal\" in ent_3:\n", + " ent_margin_withdrawal[ent_3] = bbox_3\n", + " elif \"deposit\" in ent_3:\n", + " ent_margin_deposit[ent_3] = bbox_3\n", + " else:\n", + " ent_margin_withdrawal[ent_3] = bbox_3\n", + " ent_margin_deposit[ent_3] = bbox_3\n", + "\n", + " def get_x_region(\n", + " ent_margin_withdrawal: Dict[str, List[float]]\n", + " ) -> Dict[str, List[float]]:\n", + " \"\"\"\n", + " Calculates the x-regions for withdrawal entities.\n", + "\n", + " Args:\n", + " - ent_margin_withdrawal (Dict[str, List[float]]): A dictionary containing the margins for withdrawal entities.\n", + "\n", + " Returns:\n", + " - Dict[str, List[float]]: A dictionary containing the x-regions for withdrawal entities.\n", + " \"\"\"\n", + " sorted_ent_margin_withdrawal = sorted_data = dict(\n", + " sorted(ent_margin_withdrawal.items(), key=lambda x: x[1][0])\n", + " )\n", + " ent_x_regions = {}\n", + " keys_sorted = list(sorted_ent_margin_withdrawal.keys())\n", + " for n_1 in range(len(keys_sorted)):\n", + " if n_1 < len(keys_sorted) - 1:\n", + " if (\n", + " sorted_ent_margin_withdrawal[keys_sorted[n_1]][2]\n", + " > sorted_ent_margin_withdrawal[keys_sorted[n_1 + 1]][0]\n", + " ):\n", + " ent_x_regions[keys_sorted[n_1]] = [\n", + " sorted_ent_margin_withdrawal[keys_sorted[n_1]][0],\n", + " sorted_ent_margin_withdrawal[keys_sorted[n_1]][2],\n", + " ]\n", + " else:\n", + " ent_x_regions[keys_sorted[n_1]] = [\n", + " sorted_ent_margin_withdrawal[keys_sorted[n_1]][0],\n", + " sorted_ent_margin_withdrawal[keys_sorted[n_1 + 1]][0],\n", + " ]\n", + " else:\n", + " ent_x_regions[keys_sorted[n_1]] = [\n", + " sorted_ent_margin_withdrawal[keys_sorted[n_1]][0],\n", + " sorted_ent_margin_withdrawal[keys_sorted[n_1]][2],\n", + " ]\n", + "\n", + " return ent_x_regions\n", + "\n", + " withdrawal_x_region = get_x_region(ent_margin_withdrawal)\n", + " deposit_x_region = get_x_region(ent_margin_deposit)\n", + " ent_x_region = {**deposit_x_region, **withdrawal_x_region}\n", + "\n", + " return ent_x_region\n", + "\n", + "\n", + "def get_line_item_y_region(line_items: List[Dict[str, Any]]) -> List[float]:\n", + " \"\"\"\n", + " Computes the y-regions for line items based on their bounding box coordinates.\n", + "\n", + " Args:\n", + " - line_items (List[Dict[str, Any]]): A list of line items.\n", + "\n", + " Returns:\n", + " - List[float]: A list containing the y-regions for line items.\n", + " \"\"\"\n", + " line_item_y_region = []\n", + " max_y_line_item = []\n", + " for tab_item in line_items:\n", + " y_1 = []\n", + " for line_details in tab_item.page_anchor.page_refs:\n", + " page = line_details.page\n", + " for xy_1 in line_details.bounding_poly.normalized_vertices:\n", + " y_1.append(xy_1.y)\n", + " line_item_y_region.append(min(y_1))\n", + " max_y_line_item.append(max(y_1))\n", + "\n", + " 
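As a quick aside, the 0.05-tolerance binning that `get_margin` (defined earlier in this cell) applies to sorted coordinate values can be illustrated with a small standalone sketch; the numbers below are made-up normalized coordinates, not values taken from any parser output.

```python
# Illustrative sketch of the get_margin() binning step: sorted values are grouped
# into a new bin whenever the gap to the previous value reaches the 0.05 tolerance.
def bin_values(values, tol=0.05):
    values = sorted(values)
    bins, current = [], [values[0]]
    for v in values[1:]:
        if v - current[-1] < tol:
            current.append(v)
        else:
            bins.append(current)
            current = [v]
    bins.append(current)
    return bins


print(bin_values([0.11, 0.12, 0.13, 0.45, 0.46, 0.90]))
# [[0.11, 0.12, 0.13], [0.45, 0.46], [0.9]]
```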
line_item_y_region.append(max(max_y_line_item))\n", + " sorted_line_item_y_region = sorted(line_item_y_region)\n", + "\n", + " return sorted_line_item_y_region\n", + "\n", + "\n", + "def get_line_item_y_region_by_anchor(\n", + " line_items: List[Dict[str, Any]], anchor_entity: str\n", + ") -> List[float]:\n", + " \"\"\"\n", + " Computes the y-regions for line items based on their bounding box coordinates related to the anchor entity.\n", + "\n", + " Args:\n", + " - line_items (List[Dict[str, Any]]): A list of line items.\n", + " - anchor_entity (str): The anchor entity.\n", + "\n", + " Returns:\n", + " - List[float]: A list containing the y-regions for line items related to the anchor entity.\n", + " \"\"\"\n", + " y_max_anchor = []\n", + " y_min_anchor = []\n", + "\n", + " for tab1_item in line_items:\n", + " for child in tab1_item.properties:\n", + " if child.type_ == anchor_entity:\n", + " y_2 = []\n", + " for child_details in child.page_anchor.page_refs:\n", + " for xy_2 in child_details.bounding_poly.normalized_vertices:\n", + " y_2.append(xy_2.y)\n", + " y_max_anchor.append(max(y_2))\n", + " y_min_anchor.append(min(y_2))\n", + " sorted_y_max_anchor = sorted(y_max_anchor)\n", + " sorted_y_min_anchor = sorted(y_min_anchor)\n", + " sorted_y_max_anchor.append(sorted_y_min_anchor[0])\n", + " sorted_y_anchor = sorted(sorted_y_max_anchor)\n", + "\n", + " return sorted_y_anchor\n", + "\n", + "\n", + "def get_line_item_region(\n", + " schema_xy: List[Dict[str, List[List[float]]]],\n", + " anchor_entity: str,\n", + " line_items: List[Dict[str, Any]],\n", + ") -> Tuple[List[List[float]], List[Dict[str, Any]]]:\n", + " \"\"\"\n", + " Computes the regions for line items based on the schema and anchor entity.\n", + "\n", + " Args:\n", + " - schema_xy (List[Dict[str, List[List[float]]]]): A list of dictionaries representing the bounding box coordinates\n", + " of line items for each type.\n", + " - anchor_entity (str): The anchor entity.\n", + " - line_items (List[Dict[str, Any]]): A list of line items.\n", + "\n", + " Returns:\n", + " - Tuple[List[List[float]], List[Dict[str, Any]]]: A tuple containing the regions for line items and a list of child items.\n", + " \"\"\"\n", + " region_y = []\n", + " for reg in schema_xy:\n", + " for e4, v4 in reg.items():\n", + " if \"date\" in e4: # if e4==anchor_entity:\n", + " region_y.append(v4[0][1])\n", + " # Get line item total region and getting all child items into single list\n", + " bbox_line_y = []\n", + " bbox_line_x = []\n", + " child_items = []\n", + " for line_item in line_items:\n", + " bbox_line = get_page_bbox(line_item)\n", + " bbox_line_y.extend([bbox_line[1], bbox_line[3]])\n", + " bbox_line_x.extend([bbox_line[0], bbox_line[2]])\n", + " for child in line_item.properties:\n", + " child_items.append(child)\n", + " line_item_start_y = min(bbox_line_y)\n", + " line_item_end_y = max(bbox_line_y)\n", + "\n", + " # getting Boundry for each line item\n", + " line_item_region = []\n", + " region_y = sorted(region_y)\n", + " for r1 in range(len(region_y)):\n", + " if r1 == 0:\n", + " line_item_region.append([line_item_start_y, region_y[r1 + 1]])\n", + " elif r1 == len(region_y) - 1:\n", + " line_item_region.append([region_y[r1], line_item_end_y])\n", + " else:\n", + " line_item_region.append([region_y[r1], region_y[r1 + 1]])\n", + "\n", + " return line_item_region, child_items\n", + "\n", + "\n", + "def group_line_items(parent_type, child_items, page, line_item_region, json_dict):\n", + " \"\"\"\n", + " Groups child items into line items based on the 
provided parent type and line item regions.\n", + "\n", + " Args:\n", + " - parent_type (str): The type of the parent line item.\n", + " - child_items (List[Dict[str, Any]]): A list of child items.\n", + " - page (str): The page number.\n", + " - line_item_region (List[List[float]]): A list of line item regions.\n", + " - json_dict :documentai.Document\n", + "\n", + " Returns:\n", + " - List[Dict[str, Any]]: A list containing the grouped line items.\n", + " \"\"\"\n", + " grouped_line_items = []\n", + "\n", + " for boundry in line_item_region:\n", + " line_item_temp = {\n", + " \"mention_text\": \"\",\n", + " \"page_anchor\": {\n", + " \"page_refs\": [\n", + " {\"bounding_poly\": {\"normalized_vertices\": []}, \"page\": page}\n", + " ]\n", + " },\n", + " \"properties\": [],\n", + " \"text_anchor\": {\"text_segments\": []},\n", + " \"type\": parent_type,\n", + " }\n", + " text_anc_temp = []\n", + " page_anc_temp = {\"x\": [], \"y\": []}\n", + " mt_temp = \"\"\n", + " for child_1 in child_items:\n", + " bbox_temp = get_page_bbox(child_1)\n", + " if (\n", + " bbox_temp[1] >= boundry[0] - 0.005\n", + " and bbox_temp[3] <= boundry[1] + 0.005\n", + " ):\n", + " line_item_temp[\"properties\"].append(child_1)\n", + " page_anc_temp[\"x\"].extend([bbox_temp[0], bbox_temp[2]])\n", + " page_anc_temp[\"y\"].extend([bbox_temp[1], bbox_temp[3]])\n", + " seg_temp = child_1.text_anchor.text_segments\n", + " for seg in seg_temp:\n", + " text_anc_temp.append(\n", + " {\n", + " \"start_index\": str(seg.start_index),\n", + " \"end_index\": str(seg.end_index),\n", + " }\n", + " )\n", + " sorted_data = sorted(text_anc_temp, key=lambda x: int(x[\"end_index\"]))\n", + " for sort_text in sorted_data:\n", + " mt_temp = (\n", + " mt_temp\n", + " + \" \"\n", + " + json_dict.text[\n", + " int(sort_text[\"start_index\"]) : int(sort_text[\"end_index\"])\n", + " ]\n", + " )\n", + " line_item_temp[\"text_anchor\"][\"text_segments\"] = sorted_data\n", + " line_item_temp[\"mention_text\"] = mt_temp\n", + " line_item_temp[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"][\n", + " \"normalized_vertices\"\n", + " ] = [\n", + " {\"x\": min(page_anc_temp[\"x\"]), \"y\": min(page_anc_temp[\"y\"])},\n", + " {\"x\": max(page_anc_temp[\"x\"]), \"y\": min(page_anc_temp[\"y\"])},\n", + " {\"x\": max(page_anc_temp[\"x\"]), \"y\": max(page_anc_temp[\"y\"])},\n", + " {\"x\": min(page_anc_temp[\"x\"]), \"y\": max(page_anc_temp[\"y\"])},\n", + " ]\n", + " grouped_line_items.append(line_item_temp)\n", + "\n", + " return grouped_line_items\n", + "\n", + "\n", + "def get_updated_grouped_line_items(json_dict, parent_type):\n", + " \"\"\"\n", + " Groups child items into line items based on the specified parent type and line item regions.\n", + "\n", + " Args:\n", + " - parent_type (str): The type of the parent line item.\n", + " - child_items (List[Dict[str, Any]]): A list of child items.\n", + " - page (str): The page number.\n", + " - line_item_region (List[List[float]]): A list of line item regions.\n", + " - json_dict :documentai.Document.\n", + "\n", + " Returns:\n", + " documentai.Document.\n", + " \"\"\"\n", + " final_line_items = []\n", + " page_wise_ent = get_page_wise_entities(json_dict)\n", + " entities_ungrouped = []\n", + " other_entities = []\n", + " for page_num, ent in page_wise_ent.items():\n", + " try:\n", + " line_items = [\n", + " entity\n", + " for entity in ent\n", + " if entity.properties and entity.type == parent_type\n", + " ]\n", + " line_items_other = [\n", + " entity\n", + " for entity in ent\n", + " if 
entity.properties and entity.type != parent_type\n", + " ]\n", + " for other_ent in line_items_other:\n", + " other_entities.append(other_ent)\n", + " try:\n", + " line_item_schema, schema_xy = get_schema_with_bbox(line_items)\n", + " anchor_entity = get_anchor_entity(schema_xy, line_item_schema)\n", + " line_item_region, child_items = get_line_item_region(\n", + " schema_xy, anchor_entity, line_items\n", + " )\n", + " grouped_line_items = group_line_items(\n", + " parent_type, child_items, page_num, line_item_region, json_dict\n", + " )\n", + " for item in grouped_line_items:\n", + " final_line_items.append(item)\n", + " except:\n", + " entities_ungrouped.append(line_items)\n", + " continue\n", + " except:\n", + " continue\n", + " final_entities = []\n", + " for en3 in json_dict.entities:\n", + " if en3.type != parent_type:\n", + " final_entities.append(en3)\n", + " for lin_it in final_line_items:\n", + " final_entities.append(lin_it)\n", + " if len(entities_ungrouped) > 0:\n", + " for item_1 in entities_ungrouped:\n", + " for item_2 in item_1:\n", + " final_entities.append(item_2)\n", + "\n", + " json_dict.entities = final_entities\n", + "\n", + " return json_dict\n", + "\n", + "\n", + "def get_missing_data(\n", + " json_dict: documentai.Document, parent_type: str\n", + ") -> documentai.Document:\n", + " \"\"\"\n", + " Processes the JSON dictionary to handle missing data by adding new entities.\n", + "\n", + " Args:\n", + " - json_dict : documentai.Document.\n", + " - parent_type (str): The parent type of line items.\n", + "\n", + " Returns:\n", + " - Dict[str, Any]: The updated JSON dictionary with missing data handled.\n", + " \"\"\"\n", + " page_wise_ent = get_page_wise_entities(json_dict)\n", + " new_added_entities = []\n", + " other_entities = []\n", + " json_dict = get_updated_grouped_line_items(json_dict, parent_type)\n", + " for page_num, ent in page_wise_ent.items():\n", + " line_items = [\n", + " entity for entity in ent if entity.properties and entity.type == parent_type\n", + " ]\n", + " line_items_other = [\n", + " entity for entity in ent if entity.properties and entity.type != parent_type\n", + " ]\n", + " for other_ent in line_items_other:\n", + " other_entities.append(other_ent)\n", + " if len(line_items) > 2:\n", + " line_item_schema, schema_xy = get_schema_with_bbox(line_items)\n", + " ent_x_region = entity_region_x(schema_xy)\n", + " anchor_entity = get_anchor_entity(schema_xy, line_item_schema)\n", + " line_item_y_region = get_line_item_y_region(line_items)\n", + " temp_schema_dict, final_ent_x12, final_ent_y12 = get_line_items_schema(\n", + " line_items\n", + " )\n", + " new_ent = get_missing_fields(\n", + " json_dict,\n", + " line_items,\n", + " temp_schema_dict,\n", + " final_ent_x12,\n", + " ent_x_region,\n", + " line_item_y_region,\n", + " )\n", + " for item in new_ent:\n", + " new_added_entities.append(item)\n", + " else:\n", + " for lin_it1 in line_items:\n", + " other_entities.append(lin_it1)\n", + " final_entities = []\n", + " for en3 in json_dict.entities:\n", + " if en3.type != parent_type:\n", + " final_entities.append(en3)\n", + " for lin_it in new_added_entities:\n", + " final_entities.append(lin_it)\n", + " for lin_it2 in other_entities:\n", + " final_entities.append(lin_it2)\n", + " json_dict.entities = final_entities\n", + "\n", + " return json_dict\n", + "\n", + "\n", + "def main():\n", + " file_name_list, file_path_dict = file_names(Gcs_input_path)\n", + " for i in range(len(file_name_list)):\n", + " file_path = (\n", + " \"gs://\"\n", + " + 
Gcs_input_path.split(\"/\")[2]\n", + " + \"/\"\n", + " + file_path_dict[file_name_list[i]]\n", + " )\n", + " print(file_path)\n", + " json_data = documentai_json_proto_downloader(\n", + " file_path.split(\"/\")[2], (\"/\").join(file_path.split(\"/\")[3:])\n", + " )\n", + " if Missing_items_flag == \"True\":\n", + " json_data = get_missing_data(json_data, parent_type)\n", + " json_data = get_updated_grouped_line_items(json_data, parent_type)\n", + " store_document_as_json(\n", + " documentai.Document.to_json(json_data),\n", + " Gcs_output_path.split(\"/\")[2],\n", + " (\"/\").join(Gcs_output_path.split(\"/\")[3:]) + \"/\" + file_name_list[i],\n", + " )\n", + "\n", + "\n", + "main()" + ] + }, + { + "cell_type": "markdown", + "id": "9ae3a0ae-5931-46bb-88d8-1523688935b2", + "metadata": {}, + "source": [ + "## Output" + ] + }, + { + "cell_type": "markdown", + "id": "565fb0ae-b6c8-4336-b0e0-3d56a0916b17", + "metadata": {}, + "source": [ + "The missing fields will be detected from the existing line items and grouped and updated json is saved in ouput path" + ] + } + ], + "metadata": { + "environment": { + "kernel": "conda-root-py", + "name": "workbench-notebooks.m113", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/categorizing_bank_statement_transactions_by_account_number/README.md b/incubator-tools/categorizing_bank_statement_transactions_by_account_number/README.md new file mode 100644 index 000000000..5b20a964d --- /dev/null +++ b/incubator-tools/categorizing_bank_statement_transactions_by_account_number/README.md @@ -0,0 +1,61 @@ +# Purpose and Description + +This document guides to categorize the transactions for each account number from the bank statement parsed json. + +## Input Details + +* **gcs_input_path**: Input GCS path which contains bank statement parser JSON files. +* **gcs_output_path**: GCS path to store post processed(JSON) results. + +## Output Details + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Bank Statement parser output entity type (Before post processing)After post processing
account_numberaccount_0_number
account_1_number ..etc
account_typeaccount_0_name
account_1_name ..etc
starting_balanceaccount_0_beginning_balance
account_1_beginning_balance ..etc
ending_balanceaccount_0_ending_balance
account_1_ending_balance ..etc
table_item/transaction_deposit_dateaccount_0_transaction/deposit_date
account_1_transaction/deposit_date ..etc
table_item/transaction_deposit_descriptionaccount_0_transaction/deposit_description
account_1_transaction/deposit_description ..etc
table_item/transaction_depositaccount_0_transaction/deposit
account_1_transaction/deposit ..etc
table_item/transaction_withdrawal_dateaccount_0_transaction/withdrawal_date
account_1_transaction/withdrawal_date ..etc
table_item/transaction_withdrawal_descriptionaccount_0_transaction/withdrawal_description
account_1_transaction/withdrawal_description ..etc
table_item/transaction_withdrawalaccount_0_transaction/withdrawal
account_1_transaction/withdrawal ..etc
table_itemaccount_0_transaction
account_1_transaction ..etc
diff --git a/incubator-tools/categorizing_bank_statement_transactions_by_account_number/categorizing_bank_statement_transactions_by_account_number.ipynb b/incubator-tools/categorizing_bank_statement_transactions_by_account_number/categorizing_bank_statement_transactions_by_account_number.ipynb new file mode 100644 index 000000000..1a248439b --- /dev/null +++ b/incubator-tools/categorizing_bank_statement_transactions_by_account_number/categorizing_bank_statement_transactions_by_account_number.ipynb @@ -0,0 +1,750 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Categorizing Bank Statement Transactions by Account Number" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Objective\n", + "This document guides to categorize the transactions for each account number from the bank statement parsed json.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisite\n", + "* Python : Jupyter notebook (Vertex) \n", + "* GCS storage bucket\n", + "* Bank Statement Parser" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step by Step Procedure" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Import ModulesPpackages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install google-cloud-documentai --quiet\n", + "%pip install google-cloud-documentai-toolbox --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-01-12 12:45:47-- https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 29735 (29K) [text/plain]\n", + "Saving to: ‘utilities.py’\n", + "\n", + "utilities.py 100%[===================>] 29.04K --.-KB/s in 0.002s \n", + "\n", + "2024-01-12 12:45:47 (13.3 MB/s) - ‘utilities.py’ saved [29735/29735]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from collections import Counter, defaultdict\n", + "from difflib import SequenceMatcher\n", + "from typing import Dict, List, Union\n", + "\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "from google.cloud.documentai_toolbox import gcs_utilities\n", + "\n", + "import utilities" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Input Details" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`gcs_input_path`: Input GCS path which contains bank statement parser JSON files \n", + "`gcs_output_path`: GCS path to store post processed(JSON) results" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Bank statement parser jsons path\n", + "gcs_input_path = \"gs://bucket/path_to/pre/input\"\n", + "# post process json path\n", + "gcs_output_path = \"gs://bucket/path_to/post/output/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Run Below Code-cells" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Categorizing Bank Statement Transactions by Account Number Process Started...\n", + "\tFile: 1941000828-0.json\n", + "\t\tPost processed data uploaded to gs://siddamv/categorizing_bank_statement_transactions_by_account_number/post/output/1941000828-0.json\n", + "\tFile: 2016398000-0.json\n", + "\t\tPost processed data uploaded to gs://siddamv/categorizing_bank_statement_transactions_by_account_number/post/output/2016398000-0.json\n", + "\tFile: 2016654464-0.json\n", + "\t\tPost processed data uploaded to gs://siddamv/categorizing_bank_statement_transactions_by_account_number/post/output/2016654464-0.json\n", + "\tFile: 2017496199-0.json\n", + "\t\tPost processed data uploaded to gs://siddamv/categorizing_bank_statement_transactions_by_account_number/post/output/2017496199-0.json\n", + "\tFile: 2024616717-0.json\n", + "\t\tPost processed data uploaded to gs://siddamv/categorizing_bank_statement_transactions_by_account_number/post/output/2024616717-0.json\n", + "\tFile: SampleBank-0.json\n", + "\t\tPost processed data uploaded to gs://siddamv/categorizing_bank_statement_transactions_by_account_number/post/output/SampleBank-0.json\n", + "Process Completed\n" + ] + } + ], + "source": [ + "def del_ent_attrs(ent: documentai.Document.Entity) -> None:\n", + " \"\"\"To delete empty attributes of Entity object\n", + "\n", + " Args:\n", + " ent (documentai.Document.Entity): DocumentAI doc-proto object\n", + " \"\"\"\n", + "\n", + " if not ent.normalized_value:\n", + " del ent.normalized_value\n", + " if not ent.confidence:\n", + " del ent.confidence\n", + " if not ent.page_anchor:\n", + " del ent.page_anchor\n", + " if not ent.id:\n", + " del ent.id\n", + " if not ent.mention_text:\n", + " del ent.mention_text\n", + " if not ent.text_anchor:\n", + " del ent.text_anchor\n", + "\n", + 
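The functions that follow repeatedly normalize raw account-number strings with `re.sub(r"\D", "", mention_text.strip(".#:' "))` before comparing them. A minimal, self-contained illustration of that cleanup, using a made-up mention text:

```python
import re

# Hypothetical mention_text as the parser might return it: strip() removes stray
# punctuation and spaces at both ends, and the regex drops every remaining non-digit.
mention_text = "#: 4501-22897 "
account_number = re.sub(r"\D", "", mention_text.strip(".#:' "))
print(account_number)  # 450122897
```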
"\n", + "def boundary_markers(doc: documentai.Document) -> documentai.Document:\n", + " \"\"\"It will rename all entities & child_entities type_\n", + "\n", + " Args:\n", + " doc (documentai.Document): DocumentAI Doc-Proto object\n", + "\n", + " Returns:\n", + " documentai.Document: It returns DocumentAI Doc-Proto object with new entity-type\n", + " \"\"\"\n", + "\n", + " # find ent_ids of Json\n", + " ent_ids = defaultdict(list)\n", + " all_entities = []\n", + " for idx, entity in enumerate(doc.entities):\n", + " if entity.id:\n", + " ent_ids[idx].append(int(entity.id))\n", + " all_entities.append(entity)\n", + " for prop in entity.properties:\n", + " ent_ids[idx].append(int(prop.id))\n", + " all_entities.append(prop)\n", + " all_entities = sorted(all_entities, key=lambda x: x.id)\n", + " # Single Level Entities file : json_dict\n", + " json_dict = defaultdict(list)\n", + " for entity in all_entities:\n", + " json_dict[\"confidence\"].append(entity.confidence)\n", + " json_dict[\"id\"].append(entity.id)\n", + " json_dict[\"mentionText\"].append(entity.mention_text)\n", + " json_dict[\"normalizedValue\"].append(entity.normalized_value)\n", + " json_dict[\"pageAnchor\"].append(entity.page_anchor)\n", + " json_dict[\"textAnchor\"].append(entity.text_anchor)\n", + " json_dict[\"type\"].append(entity.type_)\n", + "\n", + " acc_dict = {}\n", + " idx = 0\n", + " for ent in doc.entities:\n", + " if ent.type_ != \"account_number\":\n", + " continue\n", + " pg_no = ent.page_anchor.page_refs[0].page\n", + " y_min = min(\n", + " vertex.y\n", + " for vertex in ent.page_anchor.page_refs[0].bounding_poly.normalized_vertices\n", + " )\n", + " acn = re.sub(\"\\D\", \"\", ent.mention_text.strip(\".#:' \"))\n", + " acc_dict[idx] = {\"page\": pg_no, \"account_number\": acn, \"min_y\": y_min}\n", + " idx += 1\n", + "\n", + " sorted_data = sorted(acc_dict.values(), key=lambda x: (int(x[\"page\"]), x[\"min_y\"]))\n", + " # acns -> acns\n", + " acns = {}\n", + " idx = 0\n", + " for data in sorted_data:\n", + " acn = data[\"account_number\"]\n", + " if acn not in acns and len(acn) > 6:\n", + " acns[acn] = f\"account_{idx}_number\"\n", + " idx += 1\n", + " acn_dict = {}\n", + " acn_page_dict = {}\n", + " for key, value in acns.items():\n", + " si_ei_pn = []\n", + " pg_nos = set()\n", + " zip_data = zip(\n", + " json_dict[\"mentionText\"], json_dict[\"pageAnchor\"], json_dict[\"textAnchor\"]\n", + " )\n", + " for mt, pa, ta in zip_data:\n", + " if re.sub(\"\\D\", \"\", mt.strip(\".#:' \")) == key:\n", + " page = pa.page_refs[0].page\n", + " ts = ta.text_segments[0]\n", + " si_ei_pn.append((ts.start_index, ts.end_index, page))\n", + " pg_nos.add(page)\n", + " acn_page_dict[value] = pg_nos\n", + " acn_dict[value] = si_ei_pn\n", + "\n", + " page_no = set(range(len(doc.pages)))\n", + " pages_temp = set()\n", + " for pn_set in acn_page_dict.values():\n", + " page_no = page_no & pn_set\n", + " if page_no:\n", + " pages_temp = page_no\n", + " page_no = list(pages_temp)\n", + " for value in acn_dict.values():\n", + " value.sort(key=lambda x: x[2])\n", + "\n", + " acns_to_delete = []\n", + " for key, value in acn_dict.items():\n", + " if key != \"account_0_number\":\n", + " min_si = min_ei = min_page = float(\"inf\")\n", + " data_to_rm = []\n", + " if len(value) <= 1:\n", + " acns_to_delete.append(key)\n", + " continue\n", + " for si_ei_pn in value:\n", + " check_length = len(data_to_rm) < len(value) - 1\n", + " check_if = (si_ei_pn[2] in page_no) and (si_ei_pn[2] < 3)\n", + " if check_if and check_length:\n", + " 
data_to_rm.append(si_ei_pn)\n", + " continue\n", + " min_si = min(min_si, si_ei_pn[0])\n", + " min_ei = min(min_ei, si_ei_pn[1])\n", + " min_page = min(min_page, si_ei_pn[2])\n", + " data_to_rm.append(si_ei_pn)\n", + " for k in data_to_rm:\n", + " value.remove(k)\n", + " acn_dict[key] = [(min_si, min_ei, min_page)]\n", + " continue\n", + " min_si = min_ei = 0\n", + " min_page = float(\"inf\")\n", + " data_to_rm = []\n", + " for si_ei_pn in value:\n", + " if si_ei_pn[2] != page_no[0]:\n", + " continue\n", + " min_si = max(min_si, si_ei_pn[0])\n", + " min_ei = max(min_ei, si_ei_pn[1])\n", + " min_page = min(min_page, si_ei_pn[2])\n", + " data_to_rm.append(si_ei_pn)\n", + "\n", + " for k in data_to_rm:\n", + " value.remove(k)\n", + " acn_dict[\"account_0_number\"] = [(min_si, min_ei, min_page)]\n", + "\n", + " for i in acns_to_delete:\n", + " del acn_dict[i]\n", + "\n", + " txt_len = len(doc.text)\n", + " if len(acns) > 1:\n", + " border_idx = []\n", + " for si_ei_pn in acn_dict.values():\n", + " border_idx.append((si_ei_pn[0][0], si_ei_pn[0][1]))\n", + "\n", + " region_splitter = []\n", + " for bi in border_idx:\n", + " region_splitter.append(bi[0])\n", + "\n", + " region_splitter_dict = {}\n", + " for idx, rs in enumerate(region_splitter):\n", + " region_splitter_dict[rs] = f\"account_{idx}\"\n", + " region_splitter_dict[txt_len] = \"last_index\"\n", + " else:\n", + " region_splitter_dict = dict([(txt_len, \"account_0\")])\n", + " region_splitter_dict[txt_len + 1] = \"last_index\"\n", + "\n", + " for i, _ in enumerate(json_dict[\"id\"]):\n", + " sub_str = re.sub(\"\\D\", \"\", json_dict[\"mentionText\"][i].strip(\".#:' \"))\n", + " ent_type = json_dict[\"type\"][i]\n", + " if ent_type == \"account_number\" and len(sub_str) > 5:\n", + " json_dict[\"type\"][i] = acns[sub_str]\n", + "\n", + " TYPE_MAPPING = {\n", + " \"starting_balance\": \"_beginning_balance\",\n", + " \"ending_balance\": \"_ending_balance\",\n", + " \"table_item/transaction_deposit_date\": \"_transaction/deposit_date\",\n", + " \"table_item/transaction_deposit_description\": \"_transaction/deposit_desc\",\n", + " \"table_item/transaction_deposit\": \"_transaction/deposit_amount\",\n", + " \"table_item/transaction_withdrawal_date\": \"_transaction/withdraw_date\",\n", + " \"table_item/transaction_withdrawal_description\": \"_transaction/withdraw_desc\",\n", + " \"table_item/transaction_withdrawal\": \"_transaction/withdraw_amount\",\n", + " }\n", + " for i, _id in enumerate(json_dict[\"id\"]):\n", + " try:\n", + " si = json_dict[\"textAnchor\"][i].text_segments[0].start_index\n", + " except IndexError:\n", + " # To skip entity type checking if there is no TextAnchor object in Doc Proto\n", + " continue\n", + " ent_type = json_dict[\"type\"][i]\n", + " keys = list(region_splitter_dict.keys())\n", + " for j in range(1, len(region_splitter_dict)):\n", + " if ent_type in TYPE_MAPPING and si < keys[j]:\n", + " json_dict[\"type\"][i] = (\n", + " region_splitter_dict[keys[j - 1]] + TYPE_MAPPING[ent_type]\n", + " )\n", + " break\n", + "\n", + " new_entities = []\n", + " for i, _ in enumerate(all_entities):\n", + " entity = documentai.Document.Entity(\n", + " confidence=json_dict[\"confidence\"][i],\n", + " id=json_dict[\"id\"][i],\n", + " mention_text=json_dict[\"mentionText\"][i],\n", + " normalized_value=json_dict[\"normalizedValue\"][i],\n", + " page_anchor=json_dict[\"pageAnchor\"][i],\n", + " text_anchor=json_dict[\"textAnchor\"][i],\n", + " type_=json_dict[\"type\"][i],\n", + " )\n", + " new_entities.append(entity)\n", + " 
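To make the region-splitting step above more concrete: each `account_i` owns the span of document text that begins at its account number's `start_index`, and a child entity is re-typed according to which span its own `start_index` falls into. A minimal sketch with assumed (made-up) offsets:

```python
# Assumed text offsets for illustration only; real values come from text_segments.
region_splitter_dict = {0: "account_0", 5400: "account_1", 12000: "last_index"}
keys = list(region_splitter_dict.keys())

start_index = 7310  # start index of some transaction child entity
for j in range(1, len(keys)):
    if start_index < keys[j]:
        print(region_splitter_dict[keys[j - 1]])  # -> account_1
        break
```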
new_entities_to_id_dict = {}\n", + " for ent in new_entities:\n", + " new_entities_to_id_dict[int(ent.id)] = ent\n", + " all_entities_new = [\"\"] * len(ent_ids)\n", + " for i, _ids in ent_ids.items():\n", + " if len(_ids) == 1:\n", + " all_entities_new[i] = new_entities_to_id_dict[_ids[0]]\n", + " continue\n", + " sub_entities = []\n", + " for _id in _ids:\n", + " sub_entities.append(new_entities_to_id_dict[_id])\n", + " all_entities_new[i] = doc.entities[i]\n", + " all_entities_new[i].properties = sub_entities\n", + " for ent in all_entities_new:\n", + " del_ent_attrs(ent)\n", + " for child_ent in ent.properties:\n", + " del_ent_attrs(child_ent)\n", + " for i in all_entities_new:\n", + " if i.type_ == \"table_item\":\n", + " i.type_ = i.properties[0].type_.split(\"/\")[0]\n", + "\n", + " doc.entities = all_entities_new\n", + " return doc\n", + "\n", + "\n", + "def match_ent_type(doc: documentai.Document, ent_type: str) -> Dict[str, str]:\n", + " \"\"\"It will look for provided `ent_type` with all entities in doc-proto object & clean its matched mention_text\n", + "\n", + " Args:\n", + " doc (documentai.Document): DocumentAI doc-proto object\n", + " ent_type (str): A string-data to look in all entities\n", + "\n", + " Returns:\n", + " Dict[str, str]: All matched entity-types with provided `ent_type` as key and Its most-frequent mention_text as value\n", + " \"\"\"\n", + "\n", + " types = set()\n", + " for entity in doc.entities:\n", + " if ent_type in entity.type_:\n", + " types.add(entity.type_)\n", + " types_dict = {}\n", + " for unique_type in types:\n", + " cleaned_mts = []\n", + " for entity in doc.entities:\n", + " if unique_type == entity.type_:\n", + " cleaned_mts.append(entity.mention_text.strip(\"$#\"))\n", + " data = Counter(cleaned_mts).most_common(1)[0][0]\n", + " types_dict[unique_type] = data\n", + " return types_dict\n", + "\n", + "\n", + "def fix_account_balance(doc: documentai.Document) -> documentai.Document:\n", + " \"\"\"It will fix account balance for doc-proto entities whose entity-types matches with `beginning_balance` or `ending_balance`\n", + "\n", + " Args:\n", + " doc (documentai.Document): DocumentAI doc-proto object\n", + "\n", + " Returns:\n", + " documentai.Document: It returns updated DocumentAI Doc-Proto object\n", + " \"\"\"\n", + "\n", + " beg_end_dict = dict()\n", + " beg_end_dict.update(match_ent_type(doc, \"beginning_balance\"))\n", + " beg_end_dict.update(match_ent_type(doc, \"ending_balance\"))\n", + " for entity in doc.entities:\n", + " mt = entity.mention_text.strip(\"$#\")\n", + " et = entity.type_\n", + " keys = list(beg_end_dict.keys())\n", + " values = list(beg_end_dict.values())\n", + " if et in beg_end_dict:\n", + " if mt != beg_end_dict[et] and mt in values:\n", + " entity.type_ = keys[values.index(mt)]\n", + " elif mt != beg_end_dict[et]:\n", + " doc.entities.remove(entity)\n", + " return doc\n", + "\n", + "\n", + "def find_account_number(\n", + " data: List[Dict[str, Union[int, float]]], page_no: int, y_coord: float\n", + ") -> Union[None, str]:\n", + " \"\"\"It will look for nearest account_number in provided page number based on y_coord\n", + "\n", + " Args:\n", + " data (List[Dict[str, Union[int, float]]]): It contains account-numbers and its corresponding page_no & y-coordinate\n", + " page_no (int): Page number to look for account-number\n", + " y_coord (float): minimum y-coordinate of token which matches with r\"\\sstatement\"\n", + "\n", + " Returns:\n", + " Union[None,str]: It returns either None or closest account number from 
given `page_no`\n", + " \"\"\"\n", + " closest_acc = None\n", + " min_dst = float(\"inf\")\n", + " for acn, page_info_list in data.items():\n", + " for page_info in page_info_list:\n", + " page = page_info.get(\"page\")\n", + " y = page_info.get(\"y\")\n", + " dst = abs(y_coord - y)\n", + " if page == page_no and dst < min_dst:\n", + " min_dst = dst\n", + " closest_acc = acn\n", + " return closest_acc\n", + "\n", + "\n", + "def detials_account(\n", + " doc: documentai.Document, account_type: str\n", + ") -> List[\n", + " Dict[\n", + " str,\n", + " Dict[str, Union[str, int, documentai.Document.TextAnchor.TextSegment, float]],\n", + " ]\n", + "]:\n", + " \"\"\"It will look for entities whose type_ matches with `account_type`\n", + "\n", + " Args:\n", + " doc (documentai.Document): DocumentAI doc-proto object\n", + " account_type (str): String data to match with individual entity.type_\n", + "\n", + " Returns:\n", + " List[Dict[str,Dict[str, Union[str,int,documentai.Document.TextAnchor.TextSegment, float]]]]:\n", + " it returnsList which has dictionary of mention_text and its id, page_number, text_segment, x_max & y_max\n", + " \"\"\"\n", + " acc_dict_lst = []\n", + " for ent in doc.entities:\n", + " if ent.properties:\n", + " continue\n", + " match_ratio = SequenceMatcher(None, ent.type_, account_type).ratio()\n", + " if match_ratio >= 0.9:\n", + " id1 = ent.id\n", + " page1 = ent.page_anchor.page_refs[0].page\n", + " text_segment = ent.text_anchor.text_segments[0]\n", + " x_coords = []\n", + " y_coords = []\n", + " nvs = ent.page_anchor.page_refs[0].bounding_poly.normalized_vertices\n", + " for nv in nvs:\n", + " x_coords.append(nv.x)\n", + " y_coords.append(nv.y)\n", + " x_max = max(x_coords, default=\"\")\n", + " y_max = max(y_coords, default=\"\")\n", + " acc_dict_lst.append(\n", + " {\n", + " ent.mention_text: {\n", + " \"id\": id1,\n", + " \"page\": page1,\n", + " \"textSegments\": text_segment,\n", + " \"x_max\": x_max,\n", + " \"y_max\": y_max,\n", + " }\n", + " }\n", + " )\n", + " return acc_dict_lst\n", + "\n", + "\n", + "def accounttype_change(doc: documentai.Document) -> documentai.Document:\n", + " \"\"\"It will rename entity type_ for all target entities in doc-proto object\n", + "\n", + " Args:\n", + " doc (documentai.Document): DocumentAI doc-proto object\n", + "\n", + " Returns:\n", + " documentai.Document: It returns updated doc-proto object\n", + " \"\"\"\n", + "\n", + " acc_name_dict = detials_account(doc, \"account_type\")\n", + " acn_dict = detials_account(doc, \"account_i_number\")\n", + " temp_del = []\n", + " for item in acc_name_dict:\n", + " for key in item:\n", + " if re.search(\"\\sstatement\", key, re.IGNORECASE):\n", + " temp_del.append(key)\n", + " for idx, item in enumerate(acc_name_dict):\n", + " for key in item:\n", + " for m in temp_del:\n", + " if key == m:\n", + " del acc_name_dict[idx]\n", + " acc_comp = []\n", + " for name_item in acc_name_dict:\n", + " for acn_item in acn_dict:\n", + " for key, value in name_item.items():\n", + " for acn, value_2 in acn_item.items():\n", + " y_diff = abs(value[\"y_max\"] - value_2[\"y_max\"])\n", + " acc_comp.append({key: {acn: y_diff}})\n", + "\n", + " ymin_dict = {}\n", + " for entry in acc_comp:\n", + " for acc_type, account_info in entry.items():\n", + " # acn -> account_number\n", + " for acn, miny in account_info.items():\n", + " if acn in ymin_dict:\n", + " curr_min = ymin_dict[acn][\"min_value\"]\n", + " if miny < curr_min:\n", + " ymin_dict[acn] = {\"account_type\": acc_type, \"min_value\": miny}\n", + " 
else:\n", + " ymin_dict[acn] = {\"account_type\": acc_type, \"min_value\": miny}\n", + "\n", + " # Extract one account name based on min y\n", + " result_dict = {acn: data[\"account_type\"] for acn, data in ymin_dict.items()}\n", + " acn_ymin = {}\n", + " map_acc_type = {}\n", + " for ent in doc.entities:\n", + " match_ratio = SequenceMatcher(None, ent.type_, \"account_i_number\").ratio()\n", + " if match_ratio > 0.8:\n", + " acc_num1 = re.sub(\"\\D\", \"\", ent.mention_text.strip(\".#:' \"))\n", + " if len(acc_num1) > 5:\n", + " nvs = ent.page_anchor.page_refs[0].bounding_poly.normalized_vertices\n", + " min_y1 = min(nv.y for nv in nvs)\n", + " page = ent.page_anchor.page_refs[0].page\n", + " if acc_num1 in acn_ymin.keys():\n", + " acn_ymin[acc_num1].append({\"y\": min_y1, \"page\": page})\n", + " else:\n", + " acn_ymin[acc_num1] = [{\"y\": min_y1, \"page\": page}]\n", + " cond1 = ent.mention_text in result_dict.keys()\n", + " cond2 = ent.mention_text not in map_acc_type.keys()\n", + " if cond1 and cond2:\n", + " map_acc_type[ent.mention_text] = ent.type_\n", + "\n", + " for ent in doc.entities:\n", + " cond1 = ent.type_ == \"account_type\"\n", + " cond2 = re.search(\"\\sstatement\", ent.mention_text, re.IGNORECASE)\n", + " if cond1 and cond2:\n", + " doc.entities.remove(ent)\n", + " elif cond1:\n", + " nvs = ent.page_anchor.page_refs[0].bounding_poly.normalized_vertices\n", + " ymin_2 = min(nv.y for nv in nvs)\n", + " page = ent.page_anchor.page_refs[0].page\n", + " x1 = find_account_number(acn_ymin, page, ymin_2)\n", + " try:\n", + " data = map_acc_type[x1].split(\"_\")[1]\n", + " except KeyError:\n", + " continue\n", + " else:\n", + " ent.type_ = f\"account_{data}_name\"\n", + " return doc\n", + "\n", + "\n", + "input_bucket, _ = gcs_utilities.split_gcs_uri(gcs_input_path)\n", + "output_bucket, output_files_dir = gcs_utilities.split_gcs_uri(gcs_output_path)\n", + "_, file_dict = utilities.file_names(gcs_input_path)\n", + "print(f\"Categorizing Bank Statement Transactions by Account Number Process Started...\")\n", + "for fn, fp in file_dict.items():\n", + " print(f\"\\tFile: {fn}\")\n", + " doc = utilities.documentai_json_proto_downloader(input_bucket, fp)\n", + " try:\n", + " doc = boundary_markers(doc)\n", + " except Exception as e:\n", + " doc = doc\n", + " print(\"Unable to update the account details because of {}\".format(e.args))\n", + " try:\n", + " doc = fix_account_balance(doc)\n", + " except Exception as e:\n", + " print(\n", + " \"Unable to update the starting and ending balance because of {}\".format(\n", + " e.args\n", + " )\n", + " )\n", + " try:\n", + " doc = accounttype_change(doc)\n", + " except Exception as e:\n", + " print(\"Unable to update the account type because of {}\".format(e).args)\n", + " str_data = documentai.Document.to_json(\n", + " doc,\n", + " use_integers_for_enums=False,\n", + " including_default_value_fields=False,\n", + " preserving_proto_field_name=False,\n", + " )\n", + " output_file_path = f\"{output_files_dir.rstrip('/')}/{fn}\"\n", + " target_path = output_file_path if output_files_dir else fn\n", + " utilities.store_document_as_json(str_data, output_bucket, target_path)\n", + " print(f\"\\t\\tPost processed data uploaded to gs://{output_bucket}/{target_path}\")\n", + "print(f\"Process Completed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Output Details" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The bank statement parser entities for transactions will be mapped relating to the account \n", + "Mapping as below \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Bank Statement parser output entity type (Before post processing)After post processing
account_numberaccount_0_number
account_1_number ..etc
account_typeaccount_0_name
account_1_name ..etc
starting_balanceaccount_0_beginning_balance
account_1_beginning_balance ..etc
ending_balanceaccount_0_ending_balance
account_1_ending_balance ..etc
table_item/transaction_deposit_dateaccount_0_transaction/deposit_date
account_1_transaction/deposit_date ..etc
table_item/transaction_deposit_descriptionaccount_0_transaction/deposit_description
account_1_transaction/deposit_description ..etc
table_item/transaction_depositaccount_0_transaction/deposit
account_1_transaction/deposit ..etc
table_item/transaction_withdrawal_dateaccount_0_transaction/withdrawal_date
account_1_transaction/withdrawal_date ..etc
table_item/transaction_withdrawal_descriptionaccount_0_transaction/withdrawal_description
account_1_transaction/withdrawal_description ..etc
table_item/transaction_withdrawalaccount_0_transaction/withdrawal
account_1_transaction/withdrawal ..etc
table_itemaccount_0_transaction
account_1_transaction ..etc
\n" + ] + } + ], + "metadata": { + "environment": { + "kernel": "conda-root-py", + "name": "workbench-notebooks.m113", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/incubator-tools/docai_pdf_clustering_analysis_tool/README.md b/incubator-tools/docai_pdf_clustering_analysis_tool/README.md new file mode 100644 index 000000000..04a165d66 --- /dev/null +++ b/incubator-tools/docai_pdf_clustering_analysis_tool/README.md @@ -0,0 +1,22 @@ +# DocAI PDF Clustering Analysis Tool + +## Objective + +The tool is designed to perform advanced image analysis and clustering on PDF documents. +It utilizes the VGG16 deep learning model to extract and process image features from PDF pages, +applies PCA for dimensionality reduction, and employs K-Means clustering to categorize the images into distinct groups. +The tool aims to facilitate efficient organization and analysis of visual data contained in large sets of PDF files. + +## Practical Application +This tool was created to aid in extracting tables from documents with varied layouts, responding to a +customer's need to handle hundreds of uniquely formatted documents efficiently. By using clustering +analysis, it helps in categorizing documents to facilitate easier management and analysis. This enables +users to better understand their document variations and streamline the extraction process, making it +highly beneficial for those looking to efficiently manage and analyze a large volume of PDF documents. + + +## Clustering Analysis Output + +cc2.png + +cc3.png diff --git a/incubator-tools/docai_pdf_clustering_analysis_tool/docai_pdf_clustering_analysis_tool.ipynb b/incubator-tools/docai_pdf_clustering_analysis_tool/docai_pdf_clustering_analysis_tool.ipynb new file mode 100644 index 000000000..3252bb06e --- /dev/null +++ b/incubator-tools/docai_pdf_clustering_analysis_tool/docai_pdf_clustering_analysis_tool.ipynb @@ -0,0 +1,445 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fe5922c2-06c3-4885-982e-e3b424749aad", + "metadata": {}, + "source": [ + "# DocAI PDF Clustering Analysis Tool" + ] + }, + { + "cell_type": "markdown", + "id": "baae6c60-a35c-4bfd-b68e-f5cd5725b7ea", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "32abd819-f532-4006-a479-cffb02d3428a", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "de84c621-d6ea-4e41-b0b0-f1d3afdd72e6", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "The tool is designed to perform advanced image analysis and clustering on PDF documents. It utilizes the VGG16 deep learning model to extract and process image features from PDF pages, applies PCA for dimensionality reduction, and employs K-Means clustering to categorize the images into distinct groups. 
The tool aims to facilitate efficient organization and analysis of visual data contained in large sets of PDF files.\n", + "\n", + "## Practical Application\n", + "This tool was created to aid in extracting tables from documents with varied layouts, responding to a customer's need to handle hundreds of uniquely formatted documents efficiently. By using clustering analysis, it helps in categorizing documents to facilitate easier management and analysis. This enables users to better understand their document variations and streamline the extraction process, making it highly beneficial for those looking to efficiently manage and analyze a large volume of PDF documents." + ] + }, + { + "cell_type": "markdown", + "id": "5d41e857-4d73-4b78-9413-d697d0a01dc8", + "metadata": {}, + "source": [ + "## Step by step procedure" + ] + }, + { + "cell_type": "markdown", + "id": "6ee0d0b5-8d03-4008-a46b-6fb38e99e35a", + "metadata": {}, + "source": [ + "### Install the required libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceb3423b-e4af-4c25-ba76-a7da65bf98a7", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install PyPDF2 Pillow numpy scikit-learn matplotlib tensorflow pdf2image pandas" + ] + }, + { + "cell_type": "markdown", + "id": "c7444227-86db-4a7b-a1f4-3b140c02325b", + "metadata": {}, + "source": [ + "### Import the required libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c24b1097-4cf9-44a4-93b1-d4e06b4c9f6d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import PyPDF2\n", + "from PIL import Image\n", + "import numpy as np\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.cluster import KMeans\n", + "import matplotlib.pyplot as plt\n", + "from keras.applications.vgg16 import VGG16, preprocess_input\n", + "from tensorflow.keras.preprocessing.image import img_to_array\n", + "from pdf2image import convert_from_path\n", + "from typing import List\n", + "from collections import defaultdict\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "be76a220-9f4b-4a3b-ae59-460239f937a4", + "metadata": { + "tags": [] + }, + "source": [ + "### Setup the required inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0b0e71a-588a-45cc-95da-55968792eb44", + "metadata": {}, + "outputs": [], + "source": [ + "# Directory containing PDFs for the cluster analysis\n", + "pdf_dir = \"/home/jupyter/\"" + ] + }, + { + "cell_type": "markdown", + "id": "26aa0ac8-024f-4879-95ce-f7ffa17060ce", + "metadata": {}, + "source": [ + "### Run the Code" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8ac856a2-ec0a-4bde-ac4a-229de8713edd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1/1 [==============================] - 0s 225ms/step\n", + "1/1 [==============================] - 0s 106ms/step\n", + "1/1 [==============================] - 0s 108ms/step\n", + "1/1 [==============================] - 0s 105ms/step\n", + "1/1 [==============================] - 0s 107ms/step\n", + "1/1 [==============================] - 0s 107ms/step\n", + "1/1 [==============================] - 0s 104ms/step\n", + "1/1 [==============================] - 0s 106ms/step\n", + "1/1 [==============================] - 0s 108ms/step\n", + "1/1 [==============================] - 0s 104ms/step\n", + "1/1 [==============================] - 0s 106ms/step\n", + "1/1 [==============================] - 0s 108ms/step\n", + "1/1 
[==============================] - 0s 109ms/step\n", + "1/1 [==============================] - 0s 109ms/step\n", + "1/1 [==============================] - 0s 106ms/step\n", + "1/1 [==============================] - 0s 353ms/step\n", + "1/1 [==============================] - 0s 368ms/step\n", + "1/1 [==============================] - 0s 398ms/step\n", + "1/1 [==============================] - 0s 105ms/step\n", + "1/1 [==============================] - 0s 104ms/step\n", + "1/1 [==============================] - 0s 102ms/step\n", + "1/1 [==============================] - 0s 106ms/step\n", + "1/1 [==============================] - 0s 104ms/step\n", + "1/1 [==============================] - 0s 104ms/step\n", + "1/1 [==============================] - 0s 105ms/step\n", + "1/1 [==============================] - 0s 104ms/step\n", + "1/1 [==============================] - 0s 108ms/step\n", + "1/1 [==============================] - 0s 101ms/step\n" + ] + } + ], + "source": [ + "def convert_pdf_to_images(pdf_path: str) -> List[Image.Image]:\n", + " \"\"\"\n", + " Convert each page of a PDF file into images.\n", + "\n", + " Args:\n", + " pdf_path (str): The file path of the PDF.\n", + "\n", + " Returns:\n", + " List[Image.Image]: A list of PIL Image objects, one for each page of the PDF.\n", + " \"\"\"\n", + " return convert_from_path(pdf_path)\n", + "\n", + "\n", + "def extract_images_from_pdf(pdf_path: str) -> List[Image.Image]:\n", + " \"\"\"\n", + " Extract embedded images from a PDF file.\n", + "\n", + " Args:\n", + " pdf_path (str): The file path of the PDF.\n", + "\n", + " Returns:\n", + " List[Image.Image]: A list of extracted PIL Image objects.\n", + " \"\"\"\n", + " images = []\n", + " with open(pdf_path, \"rb\") as file:\n", + " pdf = PyPDF2.PdfReader(file)\n", + " for page in pdf.pages:\n", + " if \"/XObject\" in page[\"/Resources\"]:\n", + " xObject = page[\"/Resources\"][\"/XObject\"].getObject()\n", + " for obj in xObject:\n", + " if xObject[obj][\"/Subtype\"] == \"/Image\":\n", + " size = (xObject[obj][\"/Width\"], xObject[obj][\"/Height\"])\n", + " data = xObject[obj]._data\n", + " if xObject[obj][\"/ColorSpace\"] == \"/DeviceRGB\":\n", + " mode = \"RGB\"\n", + " else:\n", + " mode = \"P\"\n", + " image = Image.frombytes(mode, size, data)\n", + " images.append(image)\n", + " return images\n", + "\n", + "\n", + "def preprocess_images(images: List[Image.Image]) -> np.ndarray:\n", + " \"\"\"\n", + " Preprocess a list of images for VGG16 model.\n", + "\n", + " Args:\n", + " images (List[Image.Image]): A list of PIL Image objects.\n", + "\n", + " Returns:\n", + " np.ndarray: A numpy array of processed images suitable for VGG16 model input.\n", + " \"\"\"\n", + " processed_images = []\n", + " for image in images:\n", + " image = image.resize((224, 224))\n", + " image = img_to_array(image)\n", + " image = np.expand_dims(image, axis=0)\n", + " image = preprocess_input(image)\n", + " processed_images.append(image)\n", + "\n", + " if processed_images:\n", + " return np.vstack(processed_images)\n", + " else:\n", + " return np.array(processed_images) # Return an empty numpy array if no images\n", + "\n", + "\n", + "# Load VGG16 model\n", + "model = VGG16(include_top=False, weights=\"imagenet\", pooling=\"avg\")\n", + "\n", + "# Process each PDF\n", + "features = []\n", + "file_mappings = [] # To map features to filenames\n", + "\n", + "for pdf_file in os.listdir(pdf_dir):\n", + " if pdf_file.endswith(\".pdf\"):\n", + " pdf_path = os.path.join(pdf_dir, pdf_file)\n", + " images = 
convert_pdf_to_images(pdf_path) # or extract_images_from_pdf(pdf_path)\n", + " processed_images = preprocess_images(images)\n", + "\n", + " if processed_images.size > 0:\n", + " pdf_features = model.predict(processed_images)\n", + " features.append(pdf_features)\n", + " file_mappings.extend([(pdf_file, feature) for feature in pdf_features])\n", + "\n", + "# Combine features from all PDFs\n", + "features = np.vstack([feature for _, feature in file_mappings])\n", + "\n", + "# Apply PCA\n", + "pca = PCA(n_components=0.9) # Adjust the number of components as needed\n", + "pca_features = pca.fit_transform(features)" + ] + }, + { + "cell_type": "markdown", + "id": "d30dcd01-cbe0-4f35-9ef8-1d523e243474", + "metadata": {}, + "source": [ + "## Execute the following code to utilize the Elbow method for determining the ideal number of clusters" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3f96748a-1151-48a2-b80f-6b1903d25093", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABTUAAAK9CAYAAAD1zYiuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAACJ0ElEQVR4nOzdd3RU1frG8WeSkFATeglEqvReBKQXCUURURBRmhRREKmXokIgKl0FEREUwYICKkoRkA5KKIJIkV4ExVBNQg2QnN8f5zczGUJJQpIzk3w/a83KKXtm3pmEu+597t7vthmGYQgAAAAAAAAAPISX1QUAAAAAAAAAQGIQagIAAAAAAADwKISaAAAAAAAAADwKoSYAAAAAAAAAj0KoCQAAAAAAAMCjEGoCAAAAAAAA8CiEmgAAAAAAAAA8CqEmAAAAAAAAAI9CqAkAAAAAAADAoxBqAgCAVLV+/XrZbDZ9++23Vpfi0LBhQzVs2NBx7o41uouuXbsqa9asqfJeNptNISEhD/w6ISEhstlsOn/+/IMX9QBOnDghm82mOXPmWFrH7YoUKaLHH3/c6jKSVdeuXVWkSBG3fu+0+L0DAJCaCDUBAMADs9lsCXqsX78+1WqyB0h3e4wbNy7VakluXbt2lc1mk7+/v65duxbv/uHDhx2fc9KkSYl+/atXryokJCRVf19IWX/++adCQkJ04sQJq0tJNqdPn1ZISIh27dpldSkAAMACPlYXAAAAPN8XX3zhcv75559r1apV8a6XKVNG+/fvT83S9Nxzz6lly5bxrlepUiVV60huPj4+unr1qpYsWaL27du73Pvqq6+UMWNGXb9+PUmvffXqVY0ePVqSXGaw4sEULlxY165dU4YMGVL9vf/880+NHj1aDRs2tGwGY3I7ffq0Ro8erSJFiqhy5cou92bNmqXY2FhrCgMAAKmCUBMAADywF154weV8y5YtWrVqVbzrklI91Kxateod6/B0fn5+qlOnjr7++ut4oea8efPUqlUrfffddxZVhzux2WzKmDGj1WWkC1YExwAAIHWx/BwAAFgiNjZWb7/9tgoVKqSMGTOqSZMmOnLkSLxxW7duVfPmzRUQEKDMmTOrQYMG+vXXX1OlxpiYGI0YMUL58+dXlixZ1Lp1a506dSreuIULF6patWrKlCmTcufOrRdeeEH//POP4/7ixYtls9m0e/dux7XvvvtONptNbdu2dXmtMmXK6Nlnn01QfR07dtTy5csVERHhuLZ9+3YdPnxYHTt2vONzIiIi1L9/fwUFBcnPz08lSpTQ+PHjHbPaTpw4oTx58kiSRo8e7VjGfntvy3/++Udt2rRR1qxZlSdPHg0ePFgxMTEuY65cuaJBgwY53qtUqVKaNGmSDMNwGRcdHa0BAwYoT548ypYtm1q3bq2///47Qd+BJH3wwQcqV66cMmfOrBw5cqh69eqaN2/eHT97165dlT17dgUEBKhbt266evWqy5hbt24pNDRUxYsXl5+fn4oUKaIRI0YoOjraMWbgwIHKlSuXy+d49dVXZbPZNHXqVMe1M2fOyGaz6aOPPnJ8t7f31LT3KE3I93nhwgV16tRJ/v7+yp49u7p06aI//vjjvn0658yZo3bt2kmSGjVqdNd2EL/88oseeeQRZcyYUcWKFdPnn39+x+/wXn8/9zN9+nSVK1dOfn5+CgwMVJ8+fVz+fiVzdnD58uW1Y8cOPfroo8qUKZOKFi2qGTNmOMasX79eNWrUkCR169bN8Zns38PtfS3t3/2kSZP04YcfqlixYsqcObOaNWumU6dOyTAMhYaGqlChQsqUKZOefPJJXbx40aWuH3/8Ua1atVJgYKD8/PxUvHhxhYaGxvs9PYi5c+fKx8dHQ4YMSbbXBAAgrSLUBAAAlhg3bpwWLVqkwYMHa/jw4dqyZYuef/55lzFr165V/fr1FRUVpVGjRumdd95RRESEGjdurG3btiXofa5evarz58/He9y6deu+z3377be1bNkyDR06VP369dOqVavUtGlTlz6Wc+bMUfv27eXt7a2xY8eqZ8+e+v7771W3bl1HWFO3bl3ZbDZt3LjR8bxNmzbJy8tLv/zyi+PauXPndODAAdWvXz9Bn61t27ay2Wz6/vvvHdfmzZun0qVLq2rVqnf8Lho0aKAvv/xSnTt31tSpU1WnTh0NHz5cAwcOlCTlyZPHEcI99dRT+uKLL/TFF1+4hK8xMTEKDg5Wrly5NGnSJDVo0ECTJ0/WzJkzHWMMw1Dr1q313nvvqXnz5nr33XdVqlQpDRkyxPFedj169ND777+vZs2aady4ccqQIYNatWqVoO9g1qxZ6tevn8qWLav3339fo0ePVuXKlbV169Z4Y9u3b69Lly5p7Nixat++vebMmeNYZh+3lpEjR6pq1ap677331KBBA40dO1YdOnRwjKlXr54uXryoffv2Oa7Zf5+bNm1yuSbpvr/PhHyfsbGxeuKJJ/T111+rS5cuevvtt/Xvv/+qS5cu9/2O6tevr379+kmSRowY4fidlilTxjH
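Before running the Elbow method it can be worth confirming what the `PCA(n_components=0.9)` step actually kept. The snippet below is a supplementary sketch rather than one of the notebook's own cells; it assumes the `features`, `pca`, `pca_features`, and `file_mappings` variables defined in the cell above, reports how many principal components were retained for the requested 90% of variance, and checks that every reduced row still lines up with an entry in `file_mappings`.

```python
# Supplementary check (not an original notebook cell): see how much the
# VGG16 features were compressed and confirm the row-to-file alignment.
print(f"Original feature dimension : {features.shape[1]}")
print(f"Components kept by PCA     : {pca.n_components_}")
print(f"Variance retained          : {pca.explained_variance_ratio_.sum():.2%}")

# file_mappings pairs each page-level feature with its source PDF, in the
# same order as the rows of pca_features.
assert pca_features.shape[0] == len(file_mappings)
```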
myJEjeuaZZ/TYY49p8uTJypEjh7p27eryGRPy93MvISEh6tOnjwIDAzV58mQ9/fTT+vjjj9WsWTPdvHnTZex///2nli1bqlq1apowYYIKFSqkl19+WbNnz5Zkhv9jxoyRJPXq1cvxme73XX/11VeaPn26Xn31VQ0aNEgbNmxQ+/bt9cYbb2jFihUaOnSoevXqpSVLlmjw4MEuz50zZ46yZs2qgQMHasqUKapWrZpGjhypYcOG3fezJ8TMmTPVrVs3DRs2TBMnTkyW1wQAIE0zAAAAklmfPn2Mu/3XjHXr1hmSjDJlyhjR0dGO61OmTDEkGXv27DEMwzBiY2ONhx9+2AgODjZiY2Md465evWoULVrUeOyxx+5Zw/Hjxw1Jd32EhYU5xjZo0MBo0KBBvBoLFixoREVFOa4vWLDAkGRMmTLFMAzDuHHjhpE3b16jfPnyxrVr1xzjli5dakgyRo4c6bhWrlw5o3379o7zqlWrGu3atTMkGfv37zcMwzC+//57Q5Lxxx9/3POzdenSxciSJYthGIbxzDPPGE2aNDEMwzBiYmKM/PnzG6NHj3Z8/okTJzqeFxoaamTJksU4dOiQy+sNGzbM8Pb2Nk6ePGkYhmGcO3fOkGSMGjXqju8tyRgzZozL9SpVqhjVqlVznP/www+GJOOtt95yGffMM88YNpvNOHLkiGEYhrFr1y5DkvHKK6+4jOvYseNda4jrySefNMqVK3fPMaNGjTIkGS+++KLL9aeeesrIlSuX49xeS48ePVzGDR482JBkrF271jAMwzh79qwhyZg+fbphGIYRERFheHl5Ge3atTPy5cvneF6/fv2MnDlzOv5+7b+Tzz77zDEmod/nd999Z0gy3n//fce1mJgYo3HjxvFe804WLlxoSDLWrVsX717hwoUNScbGjRsd186ePWv4+fkZgwYNclxL6N/PnZw9e9bw9fU1mjVrZsTExDiuT5s2zZBkzJ4923GtQYMGhiRj8uTJjmvR0dFG5cqVjbx58xo3btwwDMMwtm/fftfP3qVLF6Nw4cKOc/t3nydPHiMiIsJxffjw4YYko1KlSsbNmzcd15977jnD19fXuH79uuPa1atX473PSy+9ZGTOnNll3O3vfTeFCxc2WrVqZRiG+Z9/NpvNCA0Nve/zAACAiZmaAADAEt26dZOvr6/jvF69epKkY8eOSZJ27drlWEZ94cIFxwzLK1euqEmTJtq4cWOClrz26tVLq1ativcoW7bsfZ/buXNnZcuWzXH+zDPPqECBAvrpp58kSb/99pvOnj2rV155xaVXYqtWrVS6dGktW7bM5fPZZ+5dunRJf/zxh3r16qXcuXM7rm/atEnZs2dX+fLl71ubXceOHbV+/XqFh4dr7dq1Cg8Pv+vS84ULF6pevXrKkSOHy6zVpk2bKiYmxmUm6f307t3b5bxevXqO350k/fTTT/L29nbMELQbNGiQDMPQ8uXLHeMkxRvXv3//BNWRPXt2/f3339q+fXuSar5w4YKioqJcarl91uGgQYMkyfH7zJMnj0qXLu34vn799Vd5e3tryJAhOnPmjA4fPizJ/H3aZ+kmpba43+eKFSuUIUMG9ezZ03HNy8tLffr0ue9rJ0TZsmUd/wYl8zOWKlXKpYYH+ftZvXq1bty4of79+8vLy/k/QXr27Cl/f3+XfyuSuRHWSy+95Dj39fXVSy+9pLNnz2rHjh1J/pzt2rVTQECA47xmzZqSzL7APj4+Ltdv3Ljh0kYiU6ZMjuNLly7p/Pnzqlevnq5evaoDBw4kuaYJEybotdde0/jx4/XGG28k+XUAAEhv2CgIAABY4qGHHnI5z5EjhyRz2akkRzB0r+W1kZGRjufdzcMPP6ymTZsmqcaHH37Y5dxms6lEiRI6ceKEJOmvv/6SJJUqVSrec0uXLu2ytLxevXqaMWOGjhw5oqNHj8pms6l27dqOsLNnz57atGmT6tSp4xL63E/Lli2VLVs2zZ8/X7t27VKNGjVcaozr8OHD2r17t6Nn5u3Onj2boPfMmDFjvNfIkSOH43cnmd9NYGCgSygsybHk2f7d/fXXX/Ly8lLx4sVdxt3pO72ToUOHavXq1XrkkUdUokQJNWvWTB07dlSdOnXijb3X35y/v7+jlhIlSriMy58/v7Jnz+6oWTJ/n/YQdNOmTapevbqqV6+unDlzatOmTcqXL5/++OOPuwbMcSX0+yxQoIAyZ87sMu72WpPq9u/mTjU8yN/P3f6t+Pr6qlixYi7frSQFBgYqS5YsLtdKliwpyeyPWatWrXt8mru7/XPaA86goKA7Xo/7+fft26c33nhDa9eudQThdpGRkUmqZ8OGDY4WF/TRBAAgcQg1AQCAJby9ve943fj/zVfsszAnTpyoypUr33Fs1qxZU6S2lFC3bl1J0saNG3Xs2DFVrVpVWbJkUb169TR16lRdvnxZv//+u95+++1Eva6fn5/atm2ruXPn6tixY/E29IkrNjZWjz32mP73v//d8b49NLqfu/3urFCmTBkdPHhQS5cu1YoVK/Tdd99p+vTpGjlyZLx+mff7m7NLyMzKunXratasWTp27Jg2bdqkevXqyWazqW7dutq0aZMCAwMVGxvrMvvxbtzh+0zId5Ncfz9WutvnvN/nj4iIUIMGDeTv768xY8aoePHiypgxo3bu3KmhQ4cmeKOk25UrV04RERH64osv9NJLL6lo0aJJeh0AANIjQk0AAOCW7DP3/P39kzzT8kHZZ4vaGYahI0eOqGLFipKkwoULS5IOHjyoxo0bu4w9ePCg475kzhB76KGHtGnTJh07dswRdtWvX18DBw7UwoULFRMTk+BNguLq2LGjZs+eLS8vL5cNbW5XvHhxXb58+b7fZ0JCvfspXLiwVq9erUuXLrnM1rQv07V/N4ULF1ZsbKyOHj3qMovv4MGDCX6vLFmy6Nlnn9Wzzz6rGzduqG3btnr77bc1fPhwl7YACak5NjZWhw8fdtlE58yZM4qIiHD5fdp/f6tWrdL27dsdm8XUr19fH330kWOmYbVq1RL8/verbd26dbp69arLbM0jR44k6PnJ8TtN6N/PncT9t1KsWDHH9Rs3buj48ePxXvP06dO6cuWKy2zNQ4cOSZJjV/Pk+EwJtX79el24cEHff/+9y7/R48ePP9Dr5s6dW99++63q1q2rJk2a6JdfflFgYOCDlgsAQLpAT00AAOCWqlWrpuLFi2vSpEm6fPlyvPvnzp1L8Ro+//xzXbp0yXH+7bff6t9//1WLFi0kSdWrV1fevHk1Y8YMRUdHO8YtX75c+/fvj7eDd7169bR27Vpt27bNEYpVrlxZ2bJl07hx45QpU6YkhWCNGjVSaGiopk2bpvz58991XPv27RUWFqaVK1fGuxcREeHYEd4emtl3b0+Kli1bKiYmRtOmTXO5/t5778lmszm+Q/vPqVOnuox7//33E/Q+Fy5ccDn39fVV2bJlZRhGvB21E1Lznd773XfflSSX32fRokVVsGBBvffee7p586ZjuXu9evV09OhRffvtt6pVq5ZLn8YHERwcrJs3b2
rWrFmOa7Gxsfrwww8T9Hx7OPggv9OE/v3cSdOmTeXr66upU6e6zP789NNPFRkZGe/fyq1bt/Txxx87zm/cuKGPP/5YefLkcfwbSY7PlFD2mZxxa79x44amT5/+wK9dqFAhrV69WteuXdNjjz0W728aAADcGTM1AQCAW/Ly8tInn3yiFi1aqFy5curWrZsKFiyof/75R+vWrZO/v7+WLFly39fZuXOnvvzyy3jXixcvrtq1a9/zuTlz5lTdunXVrVs3nTlzRu+//75KlCjh2KwlQ4YMGj9+vLp166YGDRroueee05kzZzRlyhQVKVJEAwYMcHm9evXq6auvvnIsU5bMsOTRRx/VypUr1bBhQ5fNkxLKy8srQRuMDBkyRIsXL9bjjz+url27qlq1arpy5Yr27Nmjb7/9VidOnFDu3LmVKVMmlS1bVvPnz1fJkiWVM2dOlS9fPlEbGD3xxBNq1KiRXn/9dZ04cUKVKlXSzz//rB9//FH9+/d3zMStXLmynnvuOU2fPl2RkZF69NFHtWbNmgTPQGzWrJny58+vOnXqKF++fNq/f7+mTZumVq1axevneT+VKlVSly5dNHPmTMdy423btmnu3Llq06aNGjVq5DK+Xr16+uabb1ShQgVHf057W4FDhw4lqJ9mQrVp00aPPPKIBg0apCNHjqh06dJavHixLl68KOn+sxYrV64sb29vjR8/XpGRkfLz81Pjxo2VN2/eBNeQ0L+fO8mTJ4+GDx+u0aNHq3nz5mrdurUOHjyo6dOnq0aNGnrhhRdcxgcGBmr8+PE6ceKESpYs6egZO3PmTGXIkEGS+W84e/bsmjFjhrJly6YsWbKoZs2aKbKE+9FHH1WOHDnUpUsX9evXTzabTV988UW81gVJVaJECf38889q2LChgoODtXbtWvn7+yfLawMAkFYRagIAALfVsGFDhYWFOWYhXr58Wfnz51fNmjVddka+l6+//lpff/11vOtdunS5b6g5YsQI7d69W2PHjtWlS5fUpEkTTZ8+3WX5b9euXZU5c2aNGzdOQ4cOVZYsWfTUU09p/Pjxyp49u8vr2Wdnli5dWrly5XK5vnLlygT1X3wQmTNn1oYNG/TOO+9o4cKF+vzzz+Xv76+SJUtq9OjRLrtCf/LJJ3r11Vc1YMAA3bhxQ6NGjUpUqOnl5aXFixdr5MiRmj9/vj777DMVKVJEEydOdOwmbjd79mzlyZNHX331lX744Qc1btxYy5Yti7d5y5289NJL+uqrr/Tuu+/q8uXLKlSokPr165fkXaQ/+eQTFStWTHPmzNGiRYuUP39+DR8+XKNGjYo31h5q2gNqydy1u3bt2lq9enWy/j69vb21bNkyvfbaa5o7d668vLz01FNPadSoUapTp859l9nnz59fM2bM0NixY9W9e3fFxMRo3bp1iQo1E/P3cychISHKkyePpk2bpgEDBihnzpzq1auX3nnnHUdQaZcjRw7NnTtXr776qmbNmqV8+fJp2rRpLru/Z8iQQXPnztXw4cPVu3dv3bp1S5999lmKhJq5cuXS0qVLNWjQIL3xxhvKkSOHXnjhBTVp0kTBwcHJ8h4VKlTQ8uXL1bRpUz3xxBNasWKFy47rAADAlc1Irv97EQAAAECq+uGHH/TUU0/pl19+ueOO756oYcOGOn/+vPbu3Wt1KQAAwI3RUxMAAADwANeuXXM5j4mJ0QcffCB/f39VrVrVoqoAAACswfJzAAAAwAO8+uqrunbtmmrXrq3o6Gh9//332rx5s9555x2WKQMAgHSHUBMAAADwAI0bN9bkyZO1dOlSXb9+XSVKlNAHH3ygvn37Wl0aAABAqqOnJgAAAAAAAACPQk9NAAAAAAAAAB6FUBMAAAAAAACAR6GnZjKJjY3V6dOnlS1bNtlsNqvLAQAAAAAAADyKYRi6dOmSAgMD5eV177mYhJrJ5PTp0woKCrK6DAAAAAAAAMCjnTp1SoUKFbrnGELNZJItWzZJ5pfu7+9vcTUAAAAAAACAZ4mKilJQUJAjZ7sXQs1kYl9y7u/vT6gJAAAAAAAAJFFCWjuyURAAAAAAAAAAj0KoCQAAAAAAAMCjEGoCAAAAAAAA8CiEmgAAAAAAAAA8CqEmAAAAAAAAAI9CqAkAAAAAAADAoxBqAgAAAAAAAPAohJoAAAAAAAAAPAqhJgAAAAAAAACPQqgJAAAAAAAAwKMQagIAAAAAAADwKISaAAAAAAAAADwKoSYAAAAAAAAAj0KoCQAAAAAAAMCjEGoCAAAAAAAA8CiEmgAAAAAAAAA8CqEmAAAAAAAAAI9CqAkAAAAAAADAoxBqAgAAAAAAAPAohJoAAAAAAAAAPAqhJgAAAAAAAACPQqiJewoJkUJD73wvNNS8DwAAAAAAAKQmQk3ck7e3NHJk/GAzNNS87u1tTV0AAAAAAABIv3ysLgDu7c03zZ8jR0rr1kk//ii9/755PmaM8z4AAAAAAACQWmyGYRhWF5EWREVFKSAgQJGRkfL397e6nGRXtqy0f7/k5SXFxhJoAgAAAAAAIHklJl9j+TkS5NNPzZ+xsZKPD4EmAAAAAAAArEOoiQRZvdp5fOuW1KePdbUAAAAAAAAgfSPUxH3ZNwUaPVpq1cq8Nn269MYb1tYFAAAAAACA9ImNgnBP9kDT3kPzwgWpcmXp77+lt9+WfH3N+wAAAAAAAEBqYaYm7ikmxnVToFy5pPnzJW9v83z7dutqAwAAAAAAQPrE7ufJJK3vfn67CROkoUOljBmlrVulihWtrggAAAAAAACejN3PkeIGD5ZatpSuX5fatZMuXbK6IgAAAAAAAKQXhJpIEi8vae5cqWBB6dAh6eWXJeb8AgAAAAAAIDUQaiLJcueWvvnG7K/51VfS7NlWVwQAAAAAAID0gFATD6RuXXMXdEnq21fas8faegAAAAAAAJD2EWrigQ0ZIrVo4eyvefmy1RUBAAAAAAAgLSPUxAOL21/z4EH6awIAAAAAACBlEWoiWeTJ4+yv+eWX0mefWV0RAAAAAAAA0ipCTSSbunWl0FDzuG9fae9ea+sBAAAAAABA2kSoiWQ1dKgUHCxdu0Z/TQAAAAAAAKQMQk0kKy8v6YsvpMBA6cAB6ZVX6K8JAAAAAACA5EWoiWSXJ4/09dfOgHPOHKsrAgAAAAAAQFpCqIkUUb++s79mnz7Svn3W1gMAAAAAAIC0w9JQc+zYsapRo4ayZcumvHnzqk2bNjp48KDLmIYNG8pms7k8evfu7TLm5MmTatWqlTJnzqy8efNqyJAhunXrlsuY9evXq2rVqvLz81OJEiU05w7TBz/88EMVKVJEGTNmVM2aNbVt27Zk/8zpybBhUrNmzv6aV65YXREAAAAAAADSAktDzQ0bNqhPnz7asmWLVq1apZs3b6pZs2a6clv61bNnT/3777+Ox4QJExz3YmJi1KpVK924cUObN2/W3LlzNWfOHI0cOdIx5vjx42rVqpUaNWqkXbt2qX///urRo4dWrlzpGDN//nwNHDhQo0aN0s6dO1WpUiUFBwfr7NmzKf9FpFH25ecFCkj795szN
gEAAAAAAIAHZTMM99nG5dy5c8qbN682bNig+vXrSzJnalauXFnvv//+HZ+zfPlyPf744zp9+rTy5csnSZoxY4aGDh2qc+fOydfXV0OHDtWyZcu0d+9ex/M6dOigiIgIrVixQpJUs2ZN1ahRQ9OmTZMkxcbGKigoSK+++qqGDRt239qjoqIUEBCgyMhI+fv7P8jXkOZs2CA1bizFxkqffSZ17Wp1RQAAAAAAAHA3icnX3KqnZmRkpCQpZ86cLte/+uor5c6dW+XLl9fw4cN19epVx72wsDBVqFDBEWhKUnBwsKKiorTv/xs5hoWFqWnTpi6vGRwcrLCwMEnSjRs3tGPHDpcxXl5eatq0qWPM7aKjoxUVFeXywJ01aCCNGWMev/IK/TUBAAAAAADwYNwm1IyNjVX//v1Vp04dlS9f3nG9Y8eO+vLLL7Vu3ToNHz5cX3zxhV544QXH/fDwcJdAU5LjPDw8/J5joqKidO3aNZ0/f14xMTF3HGN/jduNHTtWAQEBjkdQUFDSP3w6MHy49NhjZn/N9u3prwkAAAAAAICk87G6ALs+ffpo7969+uWXX1yu9+rVy3FcoUIFFShQQE2aNNHRo0dVvHjx1C7TYfjw4Ro4cKDjPCoqimDzHry8pC+/lCpXlv78U3r1VWn2bKurAgAAAAAAgCdyi5maffv21dKlS7Vu3ToVKlTonmNr1qwpSTpy5IgkKX/+/Dpz5ozLGPt5/vz57znG399fmTJlUu7cueXt7X3HMfbXuJ2fn5/8/f1dHri3vHmlefPMgPOzz6S5c62uCAAAAAAAAJ7I0lDTMAz17dtXixYt0tq1a1W0aNH7PmfXrl2SpAIFCkiSateurT179rjsUr5q1Sr5+/urbNmyjjFr1qxxeZ1Vq1apdu3akiRfX19Vq1bNZUxsbKzWrFnjGIPk0bChFBJiHr/yijlrEwAAAAAAAEgMS0PNPn366Msvv9S8efOULVs2hYeHKzw8XNeuXZMkHT16VKGhodqxY4dOnDihxYsXq3Pnzqpfv74qVqwoSWrWrJnKli2rTp066Y8//tDKlSv1xhtvqE+fPvLz85Mk9e7dW8eOHdP//vc/HThwQNOnT9eCBQs0YMAARy0DBw7UrFmzNHfuXO3fv18vv/yyrly5om7duqX+F5PGjRghNW0qXb1q9teMs+8TAAAAAAAAcF82wzAMy97cZrvj9c8++0xdu3bVqVOn9MILL2jv3r26cuWKgoKC9NRTT+mNN95wWe79119/6eWXX9b69euVJUsWdenSRePGjZOPj7Nl6Pr16zVgwAD9+eefKlSokN5880117drV5X2nTZumiRMnKjw8XJUrV9bUqVMdy93vJzFbzkM6c8bsrxkeLr34ovTpp1ZXBAAAAAAAACslJl+zNNRMSwg1E2/dOnPGZmys9PnnUqdOVlcEAAAAAAAAqyQmX3OLjYKQPjVqJI0aZR737i3t329tPQAAAAAAAPAMhJqw1OuvS02a0F8TAAAAAAAACUeoCUt5e0tffSXlyyft3Sv162d1RQAAAAAAAHB3hJqwXL580rx5ks1mbhj05ZdWVwQAAAAAAAB3RqgJt9C4sWt/zQMHrK0HAAAAAAAA7otQE27jjTfMcPPKFbO/5rVrVlcEAAAAAAAAd0SoCbcRt7/mnj3Sa69ZXREAAAAAAADcEaEm3Er+/GawabNJs2aZxwAAAAAAAEBchJpwO02aSG++aR6/9JJ08KC19QAAAAAAAMC9EGrCLY0cKTVsaPbXbNeO/poAAAAAAABwItSEW/L2lubNk/LmNftr9u9vdUUAAAAAAABwF4SacFsFCjj7a86caYacAAAAAAAAAKEm3FrTptIbb5jHL70kHTpkbT0AAAAAAACwHqEm3N6oUWZ/zcuXpfbt6a8JAAAAAACQ3hFqwu15e5vL0PPkkf74QxowwOqKAAAAAAAAYCVCTXiEwEDpyy/N/poffyx9843VFQEAAAAAAMAqhJrwGM2aSa+/bh737CkdPmxtPQAAAAAAALAGoSY8yqhRUv36Zn/Ndu2k69etrggAAAAAAACpjVATHsXHR/r6a/prAgAAAAAApGeEmvA4cftrzpghzZ9vdUUAAAAAAABITYSa8EjNmknDh5vH9NcEAAAAAABIXwg14bFGj5bq1ZMuXZLat6e/JgAAAAAAQHpBqAmPZe+vmTu3tGuXNGiQ1RUBAAAAAAAgNRBqwqMVLCh98YV5PH26tGCBtfUAAAAAAAAg5RFqwuM1b+7sr9mjh3TkiLX1AAAAAAAAIGURaiJNGDNGqluX/poAAAAAAADpAaEm0gR7f81cuaTff5cGD7a6IgAAAAAAAKQUQk2kGYUKOftrfvih9O231tYDAAAAAACAlEGoiTSlRQtp2DDzuHt36ehRa+sBAAAAAABA8iPURJoTGirVqSNFRZn9NaOjra4IAAAAAAAAyYlQE2mOj4/0zTdmf82dO+mvCQAAAAAAkNYQaiJNKlRI+vxz83jaNOm776ytBwAAAAAAAMmHUBNpVsuW0tCh5vGLL0rHjllbDwAAAAAAAJIHoSbStNBQ6dFH6a8JAAAAAACQlhBqIk3LkMHsr5kzp7Rjh/S//1ldEQAAAAAAAB4UoSbSvKAgZ3/NqVOl77+3th4AAAAAAAA8GEJNpAutWklDhpjH9NcEAAAAAADwbISaSDfefluqXVuKjJQ6dJBu3LC6IgAAAAAAACQFoSbSjbj9Nbdvp78mAAAAAACApyLURLry0EPS3Lnm8ZQp0qJF1tYDAAAAAACAxCPURLrz+OPS4MHm8YsvSsePW1sPAAAAAAAAEodQE+nSO+9ItWpJERHSs8/SXxMAAAAAAMCTEGoiXcqQQZo/X8qRw+yvOXSo1RUBAAAAAAAgoQg1kW7F7a/5/vvSjz9aWg4AAAAAAAASiFAT6doTT0iDBpnHXbtKJ05YWQ0AAAAAAAASglAT6d7YsVLNmvTXBAAAAAAA8BSEmkj37P01s2eXtm2Thg+3uiIAAAAAAADcC6EmIKlwYWd/zXffpb8mAAAAAACAOyPUBP5f69bSgAHmcdeu0l9/WVoOAAAAAAAA7oJQE4hj3DjpkUforwkAAAAAAODOCDWBOHx9nf01t26VRoywuiIAAAAAAADcjlATuE2RItJnn5nHkydLS5ZYWg4AAAAAAABuQ6gJ3EGbNlL//uZxly7SyZNWVgMAAAAAAIC4CDWBuxg/XqpRQ/rvP7O/5s2bVlcEAAAAAAAAiVATuCt7f82AAGnLFvprAgAAAAAAuAtCTeAeihZ19tecNElautTaegAAAAAAAECoCdzXU09Jr71mHtNfEwAAAAAAwHqEmkACTJggVa8uXbwodehAf00AAAAAAAArEWoCCRC3v2ZYmPT661ZXBAAAAAAAkH4RagIJVKyYNHu2eTxxorRsmbX1AAAAAAAApFeEmkAitG0rvfqqedy5s3TqlLX1AAAAAAAApEeEmkAiTZwoVatGf00AAAAAAACrEGoCieTnJy1YIPn7S5s3S2++aXVFAAAAAAAA6QuhJpAEcftrjh8v
/fSTtfUAAAAAAACkJ4SaQBI9/bTUt6953Lmz9Pff1tYDAAAAAACQXhBqAg9g0iSpalXpwgWzv+atW1ZXBAAAAAAAkPYRagIPIG5/zV9/pb8mAAAAAABAaiDUBB5Q8eLSJ5+Yx+PGScuXW1sPAAAAAABAWkeoCSSDdu2kPn3MY/prAgAAAAAApCxCTSCZTJokVakinT8vPfcc/TUBAAAAAABSCqEmkEwyZjT7a2bLJv3yizRypNUVAQAAAAAApE2EmkAyKlHC2V9z7Fhp5Upr6wEAAAAAAEiLCDWBZNa+vfTyy+bxCy9I//xjbT0AAAAAAABpDaEmkALefVeqXJn+mgAAAAAAACmBUBNIAXH7a27aJIWEWF0RAAAAAABA2kGoCaSQhx+WZs0yj995h/6aAAAAAAAAyYVQE0hBzz4r9e4tGYbUqZN0+rTVFQEAAAAAAHg+Qk0ghb33nlSpknTunNSxI/01AQAAAAAAHhShJpDCMmaUFi6UsmaVNmyQRo+2uiIAAAAAAADPRqgJpIKHH5ZmzjSP335bWrXK2noAAAAAAAA8GaEmkEqee0566SWzv+bzz9NfEwAAAAAAIKkINYFU9N57UsWK9NcEAAAAAAB4EISaQCrKlElasMDZX3PMGKsrAgAAAAAA8DyEmkAqK1XK2V/zrbek1autrQcAAAAAAMDTEGoCFnjuOalnT2d/zX//tboiAAAAAAAAz0GoCVhkyhSpQgXp7Fkz2IyJsboiAAAAAAAAz0CoCVgkUyZp4UIpSxZp3TopNNTqigAAAAAAADwDoSZgoVKlpI8/No/HjJHWrLG2HgAAAAAAAE9AqAlY7PnnpR49nP01w8OtrggAAAAAAMC9EWoCbmDqVLO/5pkz9NcEAAAAAAC4H0JNwA1kyiQtWGD211y7VnrrLasrAgAAAAAAcF+EmoCbKF1amjHDPB492gw3AQAAAAAAEB+hJuBGXnhB6t7d7K/ZsSP9NQEAAAAAAO6EUBNwM1OnSuXLm/01X3iB/poAAAAAAAC3I9QE3EzmzGZ/zcyZpTVrpLfftroiAAAAAAAA90KoCbihMmWkjz4yj0ePltats7YeAAAAAAAAd0KoCbipzp2lbt2k2Fizv+aZM1ZXBAAAAAAA4B4INQE3Nm2aVK6cuWEQ/TUBAAAAAABMhJqAG4vbX3P1amnsWKsrAgAAAAAAsB6hJuDmypaVpk83j0eNktavt7QcAAAAAAAAyxFqAh6gSxepa1dnf82zZ62uCAAAAAAAwDqEmoCHmDbNnLX5779mf83YWKsrAgAAAAAAsAahJuAhsmQx+2tmyiStWkV/TQAAAAAAkH4RagIepFw5Z3/NkSOlDRusrQcAAAAAAMAKhJqAh+na1eyxSX9NAAAAAACQXhFqAh7oww+lMmWk06elTp3orwkAAAAAANIXQk3AA8Xtr/nzz9K4cVZXBAAAAAAAkHosDTXHjh2rGjVqKFu2bMqbN6/atGmjgwcPuoy5fv26+vTpo1y5cilr1qx6+umndebMGZcxJ0+eVKtWrZQ5c2blzZtXQ4YM0a1bt1zGrF+/XlWrVpWfn59KlCihOXPmxKvnww8/VJEiRZQxY0bVrFlT27ZtS/bPDCSX8uXNGZuS9Oab0qZN1tYDAAAAAACQWiwNNTds2KA+ffpoy5YtWrVqlW7evKlmzZrpypUrjjEDBgzQkiVLtHDhQm3YsEGnT59W27ZtHfdjYmLUqlUr3bhxQ5s3b9bcuXM1Z84cjRw50jHm+PHjatWqlRo1aqRdu3apf//+6tGjh1auXOkYM3/+fA0cOFCjRo3Szp07ValSJQUHB+ssDQvhxrp2dS4/79BBOnfO6ooAAAAAAABSns0wDMPqIuzOnTunvHnzasOGDapfv74iIyOVJ08ezZs3T88884wk6cCBAypTpozCwsJUq1YtLV++XI8//rhOnz6tfPnySZJmzJihoUOH6ty5c/L19dXQoUO1bNky7d271/FeHTp0UEREhFasWCFJqlmzpmrUqKFp06ZJkmJjYxUUFKRXX31Vw4YNu2/tUVFRCggIUGRkpPz9/ZP7qwHu6vJlqUYN6cABKThY+uknyYvGEgAAAAAAwMMkJl9zq+gjMjJSkpQzZ05J0o4dO3Tz5k01bdrUMaZ06dJ66KGHFBYWJkkKCwtThQoVHIGmJAUHBysqKkr79u1zjIn7GvYx9te4ceOGduzY4TLGy8tLTZs2dYy5XXR0tKKiolwegBWyZpUWLjT7a65cKY0fb3VFAAAAAAAAKcttQs3Y2Fj1799fderUUfny5SVJ4eHh8vX1Vfbs2V3G5suXT+Hh4Y4xcQNN+337vXuNiYqK0rVr13T+/HnFxMTccYz9NW43duxYBQQEOB5BQUFJ++BAMihfXvrgA/OY/poAAAAAACCtc5tQs0+fPtq7d6+++eYbq0tJkOHDhysyMtLxOHXqlNUlIZ178UXphRekmBjpueforwkAAAAAANIutwg1+/btq6VLl2rdunUqVKiQ43r+/Pl148YNRUREuIw/c+aM8ufP7xhz+27o9vP7jfH391emTJmUO3dueXt733GM/TVu5+fnJ39/f5cHYCWbTfroI6lUKemff6TOnc0NhAAAAAAAANIaS0NNwzDUt29fLVq0SGvXrlXRokVd7lerVk0ZMmTQmjVrHNcOHjyokydPqnbt2pKk2rVra8+ePS67lK9atUr+/v4qW7asY0zc17CPsb+Gr6+vqlWr5jImNjZWa9ascYwBPIG9v2bGjNKKFdLEiVZXBAAAAAAAkPwsDTX79OmjL7/8UvPmzVO2bNkUHh6u8PBwXbt2TZIUEBCg7t27a+DAgVq3bp127Nihbt26qXbt2qpVq5YkqVmzZipbtqw6deqkP/74QytXrtQbb7yhPn36yM/PT5LUu3dvHTt2TP/73/904MABTZ8+XQsWLNCAAQMctQwcOFCzZs3S3LlztX//fr388su6cuWKunXrlvpfDPAAKlRw9td8/XXpl1+srQcAAAAAACC52QzDMCx7c5vtjtc/++wzde3aVZJ0/fp1DRo0SF9//bWio6MVHBys6dOnuywL/+uvv/Tyyy9r/fr1ypIli7p06aJx48bJx8fHMWb9+vUaMGCA/vzzTxUqVEhvvvmm4z3spk2bpokTJyo8PFyVK1fW1KlTVbNmzQR9lsRsOQ+kNMOQOnWSvvpKKlhQ2rVLyp3b6qoAAAAAAADuLjH5mqWhZlpCqAl3c+mSVL26dOiQ1LKltGSJ5OUWXXQBAAAAAADiS0y+RsQBpFHZsjn7a/70kzRpktUVAQAAAAAAJA9CTSANq1hRmjrVPB4xQvr1V2vrAQAAAAAASA6EmkAa16OH9NxzUkyM1KGDdOGC1RUBAAAAAAA8GEJNII2z2aSPP5Yeflj6+2+pSxcpNtbqqgAAAAAAAJKOUBNIB+z9Nf38pGXLpMmTra4IAAAAAAAg6Qg1gXSiUiVpyhTzePhwafNma+sBAAAAAABIKkJNIB3p1cvsq0l/TQAAAAAA4MkINYF0JG5/zVOnpK5dJcOwuioAAAA
AAIDEIdQE0hl/f2nBArO/5tKl0rvvWl0RAAAAAABA4hBqAulQ5crS+++bx8OGSWFhVlYDAAAAAACQOISaQDr10kvSs89Kt26Z/TUvXrS6IgAAAAAAgIQh1ATSKZtNmjlTKlFCOnmS/poAAAAAAMBzEGoC6Zi9v6avr7RkifTee1ZXBAAAAAAAcH+EmkA6V6WKs7/m0KHSli2WlgMAAAAAAHBfhJoA1Lu31K4d/TUBAAAAAIBnINQEIJtNmjVLKl5c+usvqVs3+msCAAAAAAD3RagJQJIUEODsr7l4sXNJOgAAAAAAgLsh1ATgULWq9O675vHQodK2bdbWAwAAAAAAcCeEmgBcvPKK9Mwz0s2bUvv20n//WV0RAAAAAACAK0JNAC5sNumTT6RixeivCQAAAAAA3BOhJoB44vbX/PFHaepUqysCAAAAAABwItQEcEfVqkmTJ5vHQ4bQXxMAAAAAALgPQk0Ad9Wnj/T002Z/zWeflSIirK4IAAAAAACAUBPAPdhs0qefSkWLSidOSC++SH9NAAAAAABgPUJNAPdk76+ZIYO0aJH0wQdWVwQAAAAAANI7Qk0A91W9urO/5uDB0vbt1tYDAAAAAADSN0JNAAnSt6/Uti39NQEAAAAAgPUINQEkiL2/ZpEi0vHjUvfu9NcEAAAAAADWINQEkGDZszv7a37/vTRtmtUVAQAAAACA9IhQE0Ci1KghTZxoHg8eLP32m7X1AAAAAACA9IdQE0Ci9esntWkj3bghtW9Pf00AAAAAAJC6CDUBJJrNJs2e7eyv2aMH/TUBAAAAAEDqIdQEkCQ5ckjz55v9Nb/7Tpo+3eqKAAAAAABAekGoCSDJHnlEmjDBPB44UNqxw9p6AAAAAABA+kCoCeCBvPaa9OSTzv6akZFWVwQAAAAAANI6Qk0AD8Rmkz77TCpcWDp2TOrZk/6aAAAAAAAgZRFqAnhg9v6aPj7SwoXSRx9ZXREAAAAAAEjLCDUBJIuaNaXGjc3jAQOknTtd74eGSiEhqV4WAAAAAABIgwg1ASSbOnXMn/b+mlFR5nloqDRypOTtbV1tAAAAAAAg7fCxugAAacfIkdL169LYsdLRo2Z/zXLlpFGjpDFjpDfftLpCAAAAAACQFhBqAkhW77wjnT8vzZolLVhgXiPQBAAAAAAAyYnl5wCS3cyZrkvNffi/TwAAAAAAQDIi1ASQ7EJDpZgYZ7A5YoQ0fry1NQEAAAAAgLSDUBNAsrJvCjRmjHTrltSkiXl92DBpwgRrawMAAAAAAGkDoSaAZBM30LT30Fy9Wmrc2DweOlSaONG6+gAAAAAAQNpAqAkg2cTE3HlToDVrnMHm//4nTZqU+rUBAAAAAIC0w2YYhmF1EWlBVFSUAgICFBkZKX9/f6vLAdzS6NFSSIh5PGmSNGiQpeUAAAAAAAA3kph8jZmaAFLNqFHmQ5IGD5bee8/aegAAAAAAgGci1ASQqkJCzL6bkjRwIMEmAAAAAABIPEJNAKkuJMTZd3PgQOn9962sBgAAAAAAeBpCTQCpzmYz+2u+8YZ5PmCANGWKtTUBAAAAAADPQagJwBI2m7lT+ogR5nn//tLUqZaWBAAAAAAAPAShJgDL2GzSW29Jw4eb56+9Jk2bZm1NAAAAAADA/RFqArCUzSa9/bY0bJh5/uqr0ocfWlsTAAAAAABwb4SaACxns0nvvCMNHWqe9+0rTZ9ubU0AAAAAAMB9EWoCcAs2mzR2rPS//5nnffpIH31kbU0AAAAAAMA9EWoCcBs2mzRunDR4sHn+yivSjBnW1gQAAAAAANwPoSYAt2KzSRMmSIMGmecvvyx9/LG1NQEAAAAAAPdCqAnA7dhs0sSJ0sCB5nnv3tLMmdbWBAAAAAAA3AehJgC3ZLNJkyZJAwaY5y+9JM2aZW1NAAAAAADAPRBqAnBbNps0ebLUv7953quX9MknlpYEAAAAAADcAKEmALdms0nvviu99pp53rOn9Omn1tYEAAAAAACsRagJwO3ZbNJ770n9+pnnPXtKs2dbWxMAAAAAALAOoSYAj2CzSe+/L736qmQYUo8e0pw5VlcFAAAAAACsQKgJwGPYbNKUKVLfvmaw+eKL0ty5VlcFAAAAAABSG6EmAI9is0lTp0p9+pjBZrduBJsAAAAAAKQ3hJoAPI7NJn3wgfTyy85g8/PPra4KAAAAAACkFkJNAB7JZpOmTZN69zaDza5dpS+/tLoqAAAAAACQGgg1AXgsLy/pww+ll14yg80uXaSvvrK6KgAAAAAAkNIINQF4NC8vafp0qVcvKTZW6txZmjfP6qoAAAAAAEBK8rG6AAB4UF5e0kcfmbM1Z82SOnUyr3fsaG1dAAAAAAAgZTBTE0Ca4OUlzZgh9ehhztjs1En6+murqwIAAAAAACmBUBNAmuHlJX38sdS9uxlsvvCC9M03VlcFAAAAAACSG6EmgDTFy0uaOVN68UUz2Hz+eWn+fKurAgAAAAAAyYlQE0Ca4+Vl9tbs1s0ZbC5caHVVAAAAAAAgubBREIA0yctL+uQTc/OgOXOk554zr7drZ2lZAAAAAAAgGRBqAkiz4gabc+eawabNJj3zjNWVAQAAAACAB8HycwBpmre39OmnUufOUkyM1KGD9N13VlcFAAAAAAAeBKEmgDTP21uaPVvq1MkZbH7/vdVVAQAAAACApCLUBJAueHtLn30mvfCCdOuW9Oyz0qJFVlcFAAAAAACSglATQLrh7W1uGvT882aw2b699MMPVlcFAAAAAAASi1ATQLri7W1uGtSxoxlstmsn/fij1VUBAAAAAIDEINQEkO7Yg83nnnMGm4sXW10VAAAAAABIKEJNAOmSj4/0+efmpkE3b0rPPCMtWWJ1VQAAAAAAICEINQGkWz4+0hdfmJsG3bwpPf20tHSp1VUBAAAAAID7IdQEkK75+EhffmluGmQPNpcts7oqAAAAAABwL4SaANI9Hx/pq6/M3po3bkht20o//WR1VQAAAAAA4G4INQFAzmDzmWfMYPOppwg2AQAAAABwV4SaAPD/MmSQ5s0zl6Dbg83ly62uCgAAAAAA3I5QEwDiyJBB+vprcwm6PdhcscLqqgAAAAAAQFyEmgBwmwwZpG++MQPN6GipTRtp5UqrqwIAAAAAAHaEmgBwB/Zgs00bM9h88knp55+trgoAAAAAAEiEmgBwV76+0vz5ZqBpDzZXrbK6KgAAAAAAQKgJAPfg6ystWCC1bi1dv27+XL3a6qoAAAAAAEjfCDUB4D58faWFC6UnnjCDzSeekNassboqAAAAAADSL0JNAEgAe7D5+OPOYHPtWqurAgAAAAAgfSLUBIAE8vOTvv1WatVKunbNDDgJNgEAAAAASH2EmgCQCH5+0nffuQab69ZZXRUAAAAAAOkLoSYAJJI92GzZ0gw2W7WS1q+3uioAAAAAANIPQk0ASAJ7sNm8uTPY3LDB6qoAAAAAAEgfCDUBIIkyZpQWLTKDzatXzZmbGzdaXRUAAAAAAGkfoSYAPAB7sBkc7Aw2N22yuioAAAAAANI2Qk0AeEAZM0o//CA1ay
ZduSK1aEGwCQAAAABASkpSqHnmzBl16tRJgYGB8vHxkbe3t8sDANIbe7D52GPOYPOXX6yuCgAAAACAtMknKU/q2rWrTp48qTfffFMFChSQzWZL7roAwONkyiT9+KPUurW0erUZbK5YIdWpY3VlAAAAAACkLTbDMIzEPilbtmzatGmTKleunAIleaaoqCgFBAQoMjJS/v7+VpcDwEJXr5rB5po1Utas0sqV0qOPWl0VAAAAAADuLTH5WpKWnwcFBSkJWSgApAuZM0uLF0uNG0uXL5u7o4eFWV0VAAAAAABpR5JCzffff1/Dhg3TiRMnHujNN27cqCeeeEKBgYGy2Wz64YcfXO537dpVNpvN5dG8eXOXMRcvXtTzzz8vf39/Zc+eXd27d9fly5ddxuzevVv16tVTxowZFRQUpAkTJsSrZeHChSpdurQyZsyoChUq6KeffnqgzwYgfcucWVqyxAw2L10yd0cn2AQAAAAAIHkkKdR89tlntX79ehUvXlzZsmVTzpw5XR4JdeXKFVWqVEkffvjhXcc0b95c//77r+Px9ddfu9x//vnntW/fPq1atUpLly7Vxo0b1atXL8f9qKgoNWvWTIULF9aOHTs0ceJEhYSEaObMmY4xmzdv1nPPPafu3bvr999/V5s2bdSmTRvt3bs3Ed8KALiyB5uNGjmDzS1brK4KAAAAAADPl6SemnPnzr3n/S5duiS+EJtNixYtUps2bRzXunbtqoiIiHgzOO3279+vsmXLavv27apevbokacWKFWrZsqX+/vtvBQYG6qOPPtLrr7+u8PBw+fr6SpKGDRumH374QQcOHJBkhrRXrlzR0qVLHa9dq1YtVa5cWTNmzEhQ/fTUBHA3V65Ijz8urV8v+ftLP/8s1axpdVUAAAAAALiXxORrSdr9PCmhZVKtX79eefPmVY4cOdS4cWO99dZbypUrlyQpLCxM2bNndwSaktS0aVN5eXlp69ateuqppxQWFqb69es7Ak1JCg4O1vjx4/Xff/8pR44cCgsL08CBA13eNzg4+K5hqiRFR0crOjracR4VFZVMnxhAWpMli7R0qdSqlbRhg9SsmbRqlfTII1ZXBgAAAACAZ0pSqClJMTEx+uGHH7R//35JUrly5dS6dWt5e3snW3HNmzdX27ZtVbRoUR09elQjRoxQixYtFBYWJm9vb4WHhytv3rwuz/Hx8VHOnDkVHh4uSQoPD1fRokVdxuTLl89xL0eOHAoPD3dcizvG/hp3MnbsWI0ePTo5PiaAdCBLFmnZMqllS2njRmewWaOG1ZUBAAAAAOB5khRqHjlyRC1bttQ///yjUqVKSTJDvqCgIC1btkzFixdPluI6dOjgOK5QoYIqVqyo4sWLa/369WrSpEmyvEdSDR8+3GV2Z1RUlIKCgiysCIC7ixtsbtokPfYYwSYAAAAAAEmRpI2C+vXrp+LFi+vUqVPauXOndu7cqZMnT6po0aLq169fctfoUKxYMeXOnVtHjhyRJOXPn19nz551GXPr1i1dvHhR+fPnd4w5c+aMyxj7+f3G2O/fiZ+fn/z9/V0eAHA/WbNKP/0k1asnRUaaweZvv1ldFQAAAAAAniVJoeaGDRs0YcIEl53Oc+XKpXHjxmnDhg3JVtzt/v77b124cEEFChSQJNWuXVsRERHasWOHY8zatWsVGxurmv+/C0ft2rW1ceNG3bx50zFm1apVKlWqlHLkyOEYs2bNGpf3WrVqlWrXrp1inwVA+mUPNuvWdQabcf5jDAAAAAAA3EeSQk0/Pz9dunQp3vXLly+7bMhzP5cvX9auXbu0a9cuSdLx48e1a9cunTx5UpcvX9aQIUO0ZcsWnThxQmvWrNGTTz6pEiVKKDg4WJJUpkwZNW/eXD179tS2bdv066+/qm/fvurQoYMCAwMlSR07dpSvr6+6d++uffv2af78+ZoyZYrL0vHXXntNK1as0OTJk3XgwAGFhITot99+U9++fZPy9QDAfdmDzTp1pIgIqWlTaedOq6sCAAAAAMAzJCnUfPzxx9WrVy9t3bpVhmHIMAxt2bJFvXv3VuvWrRP8Or/99puqVKmiKlWqSJIGDhyoKlWqaOTIkfL29tbu3bvVunVrlSxZUt27d1e1atW0adMm+fn5OV7jq6++UunSpdWkSRO1bNlSdevW1cyZMx33AwIC9PPPP+v48eOqVq2aBg0apJEjR6pXr16OMY8++qjmzZunmTNnqlKlSvr222/1ww8/qHz58kn5egAgQbJlk5Yvlx59lGATAAAAAIDEsBmGYST2SREREerSpYuWLFmiDBkySDJ7WbZu3Vpz5sxRQEBAshfq7qKiohQQEKDIyEj6awJIlKgoqXlzKSxMypFDWrNG+v//rwcAAAAAgHQjMflakkJNu8OHD+vAgQOSzKXgJUqUSOpLeTxCTQAPIm6wmTOnGWxWrmx1VQAAAAAApJ5UCzXhRKgJ4EFFRUnBwdKWLQSbAAAAAID0JzH5mk9CX3TgwIEKDQ1VlixZXDbZuZN33303oS8LAPh//v7SihVmsLl1q9SkibR2rVSpktWVAQAAAADgXhIcav7++++6efOm4xgAkPwCAqSVK6VmzaRt25zBZsWKVlcGAAAAAID7YPl5MmH5OYDkFBFhBpvbt0u5c5vBZoUKVlcFAAAAAEDKSUy+5pWUN3jxxRd16dKleNevXLmiF198MSkvCQCII3t26eefpRo1pPPnpcaNpT17rK4KAAAAAAD3kKRQc+7cubp27Vq869euXdPnn3/+wEUBAJzBZvXqzmBz716rqwIAAAAAwHqJCjWjoqIUGRkpwzB06dIlRUVFOR7//feffvrpJ+XNmzelagWAdMcebFarRrAJAAAAAIBdgjcKkqTs2bPLZrPJZrOpZMmS8e7bbDaNHj062YoDAEg5ckirVklNm0o7d5rB5rp1UrlyVlcGAAAAAIA1EhVqrlu3ToZhqHHjxvruu++UM2dOxz1fX18VLlxYgYGByV4kAKR3cYPN3393Bptly1pdGQAAAAAAqS9RoWaDBg1069YtdenSRdWrV1dQUFBK1QUAuE3OnNLq1fGDzTJlrK4MAAAAAIDUleiNgnx8fPTtt98qJiYmJeoBANyDPdisXFk6c0Zq1Ejav9/qqgAAAAAASF1J2v28cePG2rBhQ3LXAgBIAHuwWamSM9g8cMDqqgAAAAAASD2JWn5u16JFCw0bNkx79uxRtWrVlCVLFpf7rVu3TpbiAAB3liuXGWw2aSLt3m0Gm+vXS6VKWV0ZAAAAAAApz2YYhpHYJ3l53X2Cp81mS5dL06OiohQQEKDIyEj5+/tbXQ6AdOL8eWewWaCA2WOTYBMAAAAA4IkSk68lafl5bGzsXR/pMdAEAKvkzi2tWSNVqCD9+685Y/PQIaurAgAAAAAgZSUp1AQAuI87BZuHD1tdFQAAAAAAKSfJoeaGDRv0xBNPqESJEipRooRat26tTZs2JWdtAIAEypPHDDbLl5dOn5YaNiTYBAAAAACkXUkKNb/88ks1bdpUmTNnVr9+/dSvXz9lypRJTZo00bx58
5K7RgBAAtiDzXLlzGCzUSPpyBGrqwIAAAAAIPklaaOgMmXKqFevXhowYIDL9XfffVezZs3S/v37k61AT8FGQQDcxdmzZqD5559SoULmrujFi1tdFQAAAAAA95biGwUdO3ZMTzzxRLzrrVu31vHjx5PykgCAZJI3r7R2rVS2rPT33+ZS9KNHra4KAAAAAIDkk6RQMygoSGvWrIl3ffXq1QoKCnrgogAADyZfPjPYLFPGDDYbNZKOHbO6KgAAAAAAkodPUp40aNAg9evXT7t27dKjjz4qSfr11181Z84cTZkyJVkLBAAkjT3YbNRIOnDAnLG5fr1UrJjVlQEAAAAA8GCS1FNTkhYtWqTJkyc7+meWKVNGQ4YM0ZNPPpmsBXoKemoCcFfh4c5g86GHzGCzaFGrqwIAAAAAwFVi8rUkh5pwRagJwJ39+68ZbB48aAabGzZIRYpYXRUAAAAAAE4pvlFQsWLFdOHChXjXIyIiVIx1jQDgdgoUkNatk0qWlE6eNJeinzhhdVUAAAAAACRNkkLNEydOKCYmJt716Oho/fPPPw9cFAAg+cUNNv/6y5y5+ddfVlcFAAAAAEDiJWqjoMWLFzuOV65cqYCAAMd5TEyM1qxZoyKsZwQAtxUYaAabDRtKhw87Nw8qXNjiwgAAAAAASIRE9dT08jIndtpsNt3+tAwZMqhIkSKaPHmyHn/88eSt0gPQUxOAJ/nnHzPQPHLE3DRo/Xqz1yYAAAAAAFZJTL6WqJmasbGxkqSiRYtq+/btyp07d9KrBABYpmBB54zNo0fNnxs2SEFBVlcGAAAAAMD9Jamn5vHjx+MFmhEREclRDwAglRQqZM7QLF5cOn7cDDZPnbK6KgAAAAAA7i9Joeb48eM1f/58x3m7du2UM2dOFSxYUH/88UeyFQcASFmFCpkzNosVk44dMzcP+vtvq6sCAAAAAODekhRqzpgxQ0H/v0Zx1apVWr16tVasWKEWLVpoyJAhyVogACBlBQWZMzaLFXMuRSfYBAAAAAC4s0T11LQLDw93hJpLly5V+/bt1axZMxUpUkQ1a9ZM1gIBACkvKMi1x2ajRmbQWbCg1ZUBAAAAABBfkmZq5siRQ6f+v/HaihUr1LRpU0mSYRiKiYlJvuoAAKnmoYfMILNoUXNX9EaNzF3SAQAAAABwN0kKNdu2bauOHTvqscce04ULF9SiRQtJ0u+//64SJUoka4EAgNTz0EPmjM0iRaTDh81g8/Rpq6sCAAAAAMBVkkLN9957T3379lXZsmW1atUqZc2aVZL077//6pVXXknWAgEAqatwYTPYLFyYYBMAAAAA4J5shmEYVheRFkRFRSkgIECRkZHy9/e3uhwAeGAnTpg9Nv/6SypZ0lyaXqCAxUUBAAAAANKsxORrCd4oaPHixWrRooUyZMigxYsX33Ns69atE/qyAAA3VaSIGWQ2aCAdOmTO2Fy3jmATAAAAAGC9BM/U9PLyUnh4uPLmzSsvr7uvWrfZbOlysyBmagJIq44fN4PNU6ek0qXNYDN/fqurAgAAAACkNYnJ1xLcUzM2NlZ58+Z1HN/tkR4DTQBIy4oWNWdsBgVJBw5IjRtLZ85YXRUAAAAAID1L9EZBsbGxmj17th5//HGVL19eFSpU0JNPPqnPP/9ctOcEgLSpWDFzhmahQtL+/eZSdIJNAAAAAIBVEhVqGoah1q1bq0ePHvrnn39UoUIFlStXTidOnFDXrl311FNPpVSdAACLFS9uzti0B5vM2AQAAAAAWCXBGwVJ0pw5c7Rx40atWbNGjRo1crm3du1atWnTRp9//rk6d+6crEUCANxD8eLmjM2GDaU//zSDzXXrpP/vTgIAAAAAQKpI1EzNr7/+WiNGjIgXaEpS48aNNWzYMH311VfJVhwAwP2UKGHO2AwMdAabZ89aXRUAAAAAID1JVKi5e/duNW/e/K73W7RooT/++OOBiwIAuLe4wea+fVKTJtK5c1ZXBQAAAABILxIVal68eFH58uW76/18+fLpv//+e+CiAADu7+GHzaXnBQpIe/cSbAIAAAAAUk+iQs2YmBj5+Ny9Dae3t7du3br1wEUBADxDyZLmjM0CBaQ9e8xg8/x5q6sCAAAAAKR1idooyDAMde3aVX5+fne8Hx0dnSxFAQA8R8mS5ozNRo2cweaaNVLu3FZXBgAAAABIqxIVanbp0uW+Y9j5HADSn1KlnLui794tNW1qBpu5clldGQAAAAAgLbIZhmFYXURaEBUVpYCAAEVGRsrf39/qcgDAEgcOmMHmmTNS5crS6tUEmwAAAACAhElMvpaonpoAANxL6dLmjM18+aRdu6THHpMuXrS6KgAAAABAWkOoCQBIVmXKSGvXSnnzSr//bi5FJ9gEAAAAACQnQk0AQLIrW9acsWkPNpmxCQAAAABIToSaAIAUUbasOWMzTx5p504z2PzvP6urAgAAAACkBYSaAIAUU65c/GAzIsLqqgAAAAAAno5QEwCQosqXN4PN3LmlHTsINgEAAAAAD45QEwCQ4uIGm7/9JjVrRrAJAAAAAEg6Qk0AQKqoUEFas0bKlUvavl0KDpYiI62uCgAAAADgiQg1AQCppmJFc8ZmrlzStm3mjE2CTQAAAABAYhFqAgBSVcWKzhmb27aZS9PvFGyGhkohIaleHgAAAADAAxBqAgBSXaVK0urVUqZM0t9/m8FmVJTzfmioNHKk5O1tXY0AAAAAAPdFqAkAsETlytKvv0oZM7oGm/ZAc8wY6c03ra4SAAAAAOCOfKwuAACQflWpIm3eLD36qHTqlJQ9u2QY0rBhBJoAAAAAgLtjpiYAwFJVqpgzNiUz0JSkceOksmWlXr2kL76Qjh933gMAAAAAgJmaAADLLVtm/vTykmJjzeP9+83HrFnmecGCUt26Ur165s/y5em5CQAAAADpFTM1AQCWittDMybG/ClJHTtKQ4ZItWpJPj7SP/9I8+dLffua/Thz5ZJatpTGjpU2bZKuX7f0YwAAAAAAUhEzNQEAlrnTpkD2n/brYWHS1avStm3SL7+YAebmzVJkpLR8ufmQJF9fqUYN52zORx+VcuSw5nMBAAAAAFKWzTDoUpYcoqKiFBAQoMjISPn7+1tdDgB4hJAQcwn5nTYFCg01Z26GhMS/d+uWtHu3GXLag87wcNcxNpu5RD3ukvWgoJT4FAAAAACA5JCYfI1QM5kQagKAdQxDOnbMDDftIeehQ/HHFS7sGnKWKWP28QQAAAAAWI9Q0wKEmgDgXs6edZ3J+fvv5szPuHLmlOrUcYac1aqZy9gBAAAAAKmPUNMChJoA4N4uX5a2bHGGnFu2mL0648qYUapZ0xly1q4t8R/pAAAAAJA6CDUtQKgJAJ7l5k1z9mbc2Zznz7uO8fKSKlVyhpx160oFClhTLwAAAACkdYSaFiDUBADPZhhmH864fTmPHYs/rnhx176cJUuamxIBAAAAAB4MoaYFCDUBIO05fdp1Jucff5jhZ1x58riGnFWqSD4+1tQLAAAAAJ6MUNMChJoAkPZFRkphYc6Qc+tWKTradUyWLFKtWs6Q
s1Yt8xoAAAAA4N4INS1AqAkA6U90tLRjhzPk/PVX6b//XMd4e0tVqzpDzjp1pLx5rakXAAAAANwZoaYFCDUBALGx0p9/OkPOX36RTp6MP65UKdcl68WK0ZcTAAAAAAg1LUCoCQC4k5MnXfty7t0bf0yBAq4hZ8WK5gxPAAAAAEhPCDUtQKgJAEiIixelzZudIef27dLNm65jsmWTHn3UGXI+8oiUKZM19QIAAABAaiHUtAChJgAgKa5dM4NNe8i5ebMUFeU6JkMGqXp1176cOXNaUy8AAAAApBRCTQsQagIAkkNMjLRnjzPk3LRJ+vff+OPKlXNdsl64cOrXCgAAAADJiVDTAoSaAICUYBjS8eOufTkPHIg/LijINeQsV07y8kr9egEAAAAgqQg1LUCoCQBILefOSb/+6gw5d+6Ubt1yHZM9u7lM3R5yVq8u+flZUi4AAAAAJAihpgUINQEAVrlyRdq61RlyhoWZ1+Ly8zM3HLKHnI8+KgUEWFMvAAAAANwJoaYFCDUBAO7i1i1p1y5nyPnLL9LZs65jbDapYkVnyFmvnhQYaEm5AAAAACCJUNMShJoAAHdlGNLhw659OY8ciT+uaFHXvpylS5vhJwAAAACkBkJNCxBqAgA8SXi4a8i5a5cUG+s6Jlcu15CzalUpQwZLygUAAACQDhBqWoBQEwDgyS5dMntx2kPOrVula9dcx2TKJNWq5Qw5a9eWsma1pl4AAAAAaQ+hpgUINQEAacmNG+au6nH7cl686DrG21uqXNkZctatK+XLZ0m5AAAAANIAQk0LEGoCANKy2FjpwAHXJesnTsQf9/DDrkvWS5SgLycAAACAhCHUtAChJgAgvfn7b9eQc88ec1OiuPLlcw05K1WSfHysqRcAAACAeyPUtAChJgAgvYuIkDZvdoac27aZy9jjyprV7MVpDzlr1pQyZ7akXAAAAABuhlDTAoSaAAC4un5d+u03Z8j5669SZKTrGB8fqVo1Z8hZp46UO7c19QIAAACwFqGmBQg1AQC4t9hYae9e1yXrf/8df1yZMs6Qs149qXBhZ1/OkBBzg6I334z/vNBQKSbGHAMAAADA8yQmX6OrFQAASBVeXlLFiubjlVfM/psnTzp3V9+0SfrzT2n/fvMxc6b5vIIFnSHnuXPS9Onm9bjBZmioNHKkNGZM6n8uAAAAAKmPmZrJhJmaAAA8uAsXzGXq9pDzt9+kW7dcx/j5SdHRUtOmZpj588/SqFFmoHmnGZwAAAAAPAPLzy1AqAkAQPK7etXccMgecm7eLF2+HH9c+/bSvHnm0nQAAAAAnolQ0wKEmgAApLxbt6Tdu82Qc8AAs0+nXYkS0uDBUpcuUsaM1tUIAAAAIGkSk695pVJNAAAAD8zHR6pa1dxFPTZW8vU1r2fKJB05IvXuLRUpIo0dK0VEWFkpAAAAgJREqAkAADxK3E2BoqPNn9euSS1aSEFB0pkz0ogR0kMPSUOGSP/8Y3XFAAAAAJIboSYAAPAYcQNN+6ZAb75pni9fLr34ovT551L58tKlS9KkSVLRoub1/futrR0AAABA8iHUBAAAHiMm5s67nNuDTUnq1Mnsu7l0qVS/vnTzpvTZZ1LZslKbNlJYWKqXDQAAACCZsVFQMmGjIAAA3NOWLdL48dIPPziv1asnDR0qtWwp2WyWlQYAAAAgDjYKAgAA+H+1akmLFpnLz198UcqQQdq0SXr8caliRemLL8zZnAAAAAA8B6EmAABIF0qXlj79VDp+XBo8WMqWTdq7V+rcWSpeXHr/fenyZaurBAAAAJAQhJoAACBdKVhQmjhROnlSeucdKV8+6dQpacAAqXBhcyOic+esrhIAAADAvVgaam7cuFFPPPGEAgMDZbPZ9EPcZleSDMPQyJEjVaBAAWXKlElNmzbV4cOHXcZcvHhRzz//vPz9/ZU9e3Z1795dl2+bZrF7927Vq1dPGTNmVFBQkCZMmBCvloULF6p06dLKmDGjKlSooJ9++inZPy8AAHAf2bNLw4dLJ05IM2ZIJUpIFy+aO6wXLiz17WvO6gQAAADgfiwNNa9cuaJKlSrpww8/vOP9CRMmaOrUqZoxY4a2bt2qLFmyKDg4WNevX3eMef7557Vv3z6tWrVKS5cu1caNG9WrVy/H/aioKDVr1kyFCxfWjh07NHHiRIWEhGjmzJmOMZs3b9Zzzz2n7t276/fff1ebNm3Upk0b7d27N+U+PAAAcAsZM0ovvSQdOCAtXChVry5duyZ9+KH08MNSx47SH39YXSUAAACAuNxm93ObzaZFixapTZs2ksxZmoGBgRo0aJAGDx4sSYqMjFS+fPk0Z84cdejQQfv371fZsmW1fft2Va9eXZK0YsUKtWzZUn///bcCAwP10Ucf6fXXX1d4eLh8fX0lScOGDdMPP/ygAwcOSJKeffZZXblyRUuXLnXUU6tWLVWuXFkzZsxIUP3sfg4AQNpgGNK6deaO6T//7LweHGzumN6wITumAwAAACkhTex+fvz4cYWHh6tp06aOawEBAapZs6bCwsIkSWFhYcqePbsj0JSkpk2bysvLS1u3bnWMqV+/viPQlKTg4GAdPHhQ//33n2NM3Pexj7G/z51ER0crKirK5QEAADyfzSY1biytXCnt3Cl16CB5eZnnjRtLNWtK330nxcRYXSkAAACQfrltqBkeHi5Jypcvn8v1fPnyOe6Fh4crb968Lvd9fHyUM2dOlzF3eo2473G3Mfb7dzJ27FgFBAQ4HkFBQYn9iAAAwM1VqSJ9/bV0+LD0yivmUvXt26VnnpHKlJFmzZLidMUBAAAAkErcNtR0d8OHD1dkZKTjcerUKatLAgAAKaRYMbPH5l9/SW+8IeXIYQadvXpJRYtK48ZJkZFWVwkAAACkH24baubPn1+SdObMGZfrZ86ccdzLnz+/zp4963L/1q1bunjxosuYO71G3Pe42xj7/Tvx8/OTv7+/ywMAAKRtefOau6OfPCm9954UFCSFh5u7qAcFSf/7n3T6tNVVAgAAAGmf24aaRYsWVf78+bVmzRrHtaioKG3dulW1a9eWJNWuXVsRERHasWOHY8zatWsVGxurmjVrOsZs3LhRN2/edIxZtWqVSpUqpRw5cjjGxH0f+xj7+wAAAMSVNavUv7909Kg0d65Urpx06ZI0caI5c7NHD+ngQaurBAAAANIuS0PNy5cva9euXdq1a5ckc3OgXbt26eTJk7LZbOrfv7/eeustLV68WHv27FHnzp0VGBjo2CG9TJkyat68uXr27Klt27bp119/Vd++fdWhQwcFBgZKkjp27ChfX191795d+/bt0/z58zVlyhQNHDjQUcdrr72mFStWaPLkyTpw4IBCQkL022+/qW/fvqn9lQAAAA+SIYPUubO0e7e0ZIlUt65044b06admz822baX/37sQAAAAQDKyGYZhWPXm69evV6NGjeJd79Kli+bMmSPDMDRq1CjNnDlTERERqlu3rqZPn66SJUs6xl68eFF9+/bVkiVL5OXlpaefflpTp05V1qxZHWN2796tPn36aPv27cqdO7d
effVVDR061OU9Fy5cqDfeeEMnTpzQww8/rAkTJqhly5YJ/iyJ2XIeAACkXZs3S+PHS4sXO681aCANHSo1b27urg4AAAAgvsTka5aGmmkJoSYAAIhr/35zOfqXX0r2LjgVKph9N5991pzlCQAAAMApMfma2/bUBAAA8GRlykizZ0vHjkmDBpl9OPfskTp1kkqUkKZOla5csbpKAAAAwDMRagIAAKSgQoWkSZPMHdPfftvcQf3kSem116TChaWQEOn8eaurBAAAADwLoSYAAEAqyJFDGjFCOnFC+ugjqXhx6cIFafRo6aGHpH79zHsAAAAA7o9QEwAAIBVlyiT17i0dPCgtWCBVqyZduyZ98IG5LP35583d1AEAAADcHaEmAACABby9pXbtpO3bpdWrpccek2JipHnzpEqVpBYtpPXrJbZ0BAAAAOIj1AQAALCQzSY1aSL9/LO0Y4e5M7qXl7RihdSokVSrlvT992bgCQAAAMBEqAkAAOAmqlaVvvlGOnRIevllKWNGads26emnpbJlpU8+kaKjra4SAAAAsB6hJgAAgJspXlyaPl366y/p9del7NnNoLNnT6lIEWn8eCky0uoqAQAAAOsQagIAALipvHmlt96STp6U3n1XKlRICg+Xhg0zd0wfOlT691+rqwQAAABSH6EmAACAm8uWTRowQDp6VJozx1yKHhUlTZhgztzs2dOcyQkAAACkF4SaAAAAHsLXV+rSRdqzR1q8WKpTR7pxw+y1Wbq02Xtz2zarqwQAAABSHqEmAACAh/Hykp54QvrlF/PxxBOSYZi7pNesae6avmKFeQ0AAABIiwg1AQAAPFidOuaszb17zVmcPj7S+vVSixZS5crSV19Jt25ZXSUAAACQvAg1AQAA0oBy5cx+m8eOSQMHSlmzSrt3Sy+8IJUoIX3wgXT1qtVVAgAAAMmDUBMAACANCQqSJk82d0x/6y0pTx7pr7+kfv3MHdNHj5YuXLC6SgAAAODBEGoCAACkQTlySK+/bgaa06dLxYqZYWZIiBluvvaaeQ8AAADwRISaAAAAaVimTNLLL0sHD0rffCNVqWIuQ586VSpeXOrUydxNHQAAAPAkhJoAAADpgI+P9Oyz0o4d0s8/S02aSDEx0pdfShUrSq1aSRs3smM6AAAAPAOhJgAAQDpis0mPPSatXi399pvUrp3k5SX99JPUoIH06KPSokVSbKzVlQIAAAB3R6gJAACQTlWrJi1YYC5N791b8vOTtmyR2raVypaVPv1Uio62ukoAAAAgPkJNAACAdK5ECemjj8yNg0aMkLJnN4POHj3MDYYmTpSioqyuEgAAAHAi1AQAAIAkKV8+6e23pZMnpUmTpIIFpdOnpf/9z9wxffhwKTzc6ioBAAAAQk0AAADcJls2adAg6dgxafZsqXRpKTJSGjdOKlJEeukl6fBhq6sEAABAekaoCQAAgDvy9ZW6dZP27ZN+/NHcRCg6Wpo5UypVytxkaPt2q6sEAABAekSoCQAAgHvy8pJat5Z+/VXatEl6/HHJMKRvv5UeeURq3FhaudK8BgAAAKQGQk0AAAAkWN260pIl0p49UufOko+PtG6d1Ly5VLWq9PXX0q1bVlcJAACAtI5QEwAAAIlWvrw0d6509KjUv7+UJYu0a5fUsaNUsqT04YfS1atWVwkAAIC0ilATAAAASfbQQ9J775k7poeGSnnySMePS337SoULm9cuXLC6SgAAAKQ1hJoAAAB4YDlzSm+8IZ04Yc7SLFpUOn9eGjnSDDf79zeDTwAAACA5EGoCAAAg2WTOLL3yinTokNlfs3Jl6coVacoUqXhxsw/n3r1WVwkAAABPR6gJAACAZOfjI3XoIO3cae6M3rixuYHQF19IFSqYO6hv2sSO6QAAAEgaQk0AAACkGJtNatZMWrNG2rZNeuYZ89qyZVL9+lKdOtKPP0qxsVZXCgAAAE9CqAkAAIBUUaOGtHChuTT9pZckPz8pLExq00YqV06aPVu6ccPqKgEAAOAJCDUBAACQqkqUkGbMMDcVGj5cCgiQDhyQunc3NxiaNEmKirK6SgAAALgzQk0AAABYIn9+6Z13zF3RJ06UAgOl06elIUOkhx6SRoyQzpyxukoAAAC4I0JNAAAAWMrfXxo8WDp2TPr0U6lUKSkyUho7VipcWOrdWzpyxOoqAQAA4E4INQEAAOAW/PykF1+U/vxTWrRIqlVLio6WPv7YDDrbt5d27LC6SgAAALgDQk0AAAC4FS8vc/OgzZuljRulVq3M3dEXLpSqV5eaNJF+/lkyDKsrBQAAgFUINQEAAOCWbDapXj1p6VJp926pUyfJx0dau1YKDpaqVZO++Ua6dcvqSgEAAJDaCDUBAADg9ipUkD7/XDp6VHrtNSlzZun336XnnjOXpk+fLl27ZnWVAAAASC2EmgAAAPAYDz0kvf++uWP66NFS7tzmBkN9+pibCr31lnTxotVVAgAAIKXZDINuRMkhKipKAQEBioyMlL+/v9XlAAAApAtXr0qzZ0uTJ0snTpjXsmSRevWSBgwwd1P39pbefDP+c0NDpZgYKSQkNSsGAADA3SQmX2OmJgAAADxW5sxS377S4cPSvHlSpUrSlSvSe+9JxYpJP/4ojRxpBphxhYaa1729rakbAAAAD4ZQEwAAAB7Px8fsr/n779KKFVKjRuYGQrt2mfdHjpR69DCP7YHmmDF3nsEJAAAA98fy82TC8nMAAAD3sn27NH689P33kv2/8dps5vEzz0jjxpmzOW02a+sEAACAKTH5GqFmMiHUBAAAcE+HDkmTJkmzZsW/lzOn9Mgjro88eVK/RgAAACQuX/NJpZoAAAAAS5QsKQUFmcc+Puay9EKFpHPnzJ3SV6wwH3ZFikg1azpDzqpVzd6dAAAAcB/01AQAAECaFreH5s2b5s+//5aGDTOXqH/4odSli1SmjLkU/cQJaf58adAgqV49yd9fqlzZ3FH9k0+k3bvNXdMBAABgHZafJxOWnwMAALifu20KdLfrkZHSjh3Stm3mY+tW6fTp+K+bJYtUrZrrsvWHHqI/JwAAwINg+TkAAAAgc0blnXY5t5/fPuMyIEBq3Nh82P3zjzPk3LbNnN156ZK0caP5sMub1zXkrFHD7NkJAACA5MdMzWTCTE0AAID0ITZWOnjQOZNz2zbpjz/MXp23e/hh16CzcmUpY8ZULxkAAMAjsPu5BQg1AQAA0q/r16Vdu1xndB4+HH9chgxSpUquQWepUpIXne4BAAAINa1AqAkAAIC4Ll40l6rH7c957lz8cf7+UvXqrkFnwYKpXy8AAIDVCDUtQKgJAACAezEM6eRJ19mcv/0mXb0af2zBgq4hZ/XqZvgJAACQlhFqWoBQEwAAAIl165b055+uQeeePWbfzrhsNql0adegs2JFydfXmroBAABSAqGmBQg1AQAAkByuXJF+/9112fqJE/HH+flJVaq4Bp0lSpgBKAAAgCci1LQAoSYAAABSytmzrv05t20ze3beLkcOqUYNM+CsWdM8zpcv9esFAABICkJNCxBqAgAAILUYhnTsmH
Mm57Zt0s6dUnR0/LGFC7vO5qxaVcqaNfVrBgAAuB9CTQsQagIAAMBKN2+a/Tjjzub8808zAI3Ly0sqV845m/ORR8xzHx9r6gYAALAj1LQAoSYAAADczaVL5g7rcYPOv/+OPy5TJqlaNdcZnUWK0J8TAACkLkJNCxBqAgAAwBOcPh2/P2dUVPxxefK4hpw1aki5cqV+vQAAIP0g1LQAoSYAAAA8UWysdOiQa8i5a5e5nP12xYs7Q86aNaXKlc1ZngAAAMmBUNMChJoAAABIK6KjpT/+cN2I6NCh+ON8fKSKFV1ndJYuLXl7p37NAADA8xFqWoBQEwAAAGnZf/+59ufculU6cyb+uKxZperVnZsQPfKIVLAg/TkBAMD9EWpagFATAAAA6YlhmJsOxZ3N+dtv0pUr8ccWKOA6m7N6dSl79lQvGQAAuDlCTQsQagIAACC9i4mR9u937c+5e7d5/XalSrnO5qxYUfLzS/2aAQCA+yDUtAChJgAAABDf1avS77+7Bp3HjsUf5+trbjwUd0bnww9LXl6pXjIAALAIoaYFCDUBAACAhDl/Xtq+3XXp+oUL8cdlzy7VqOEadObPn+rlAgCAVEKoaQFCTQAAACBpDEM6ftx1NueOHdL16/HHBgU5A86aNaVq1czNiQAAgOcj1LQAoSYAAACQfG7elPbtc87k3LbNPL/9f714eUlly7rO5ixfXsqQwZq6AQBA0hFqWoBQEwAAAEhZly5JO3e6zug8eTL+uIwZpapVXTciKlpUstlSv2YAAJBwhJoWINQEAAAAUl94uGvIuW2bFBkZf1yuXK6zOWvUkPLkiT8uJETy9pbefDP+vdBQcyf3kJDk/hQAAEBKXL7mk0o1AQAAAECyy59fat3afEhSbKx05IhryPn77+ZGRMuXmw+7YsVcg84qVcxAc+RI837cYDM01Lw+ZkzqfTYAAHB3zNRMJszUBAAAANxTdLS0e7dr0HngQPxx3t5ShQqSj4/0229Snz7SlCnSO+84A807zeAEAADJg+XnFiDUBAAAADxHZKQZXNpDzq1bpX//vfv4zp2lTz81A08AAJAyCDUtQKgJAAAAeC7DkP75x3U257p1rmNy55batpWefVaqX5+AEwCA5JaYfM0rlWoCAAAAALdls0mFCpmh5bhxUqNG5vUMGcyfmTNL589LM2dKTZpIBQtKr7wirV9vbh4EAABSF6EmAAAAAMQRd1OgGzfMn1evSl26SD17mjupnz0rffSRGX4WLCj17Stt2mRuVAQAAFIeoSYAAAAA/L+4gaZ9U6A33zTP586VgoLM3psrV0ovvijlyCGdOSN9+KG5JL1QIem116RffyXgBAAgJdFTM5nQUxMAAADwfCEh5i7od9rlPDTUXGoeEuK8duOGtGaNtGCBtGiRuQGRXcGCUrt2Uvv2Uq1a5hJ3AABwd2wUZAFCTQAAACB9i46WVq+W5s+XfvxRiopy3nvoIWfAWaMGAScAAHdCqGkBQk0AAAAAdtevSz//bM7g/PFH6fJl573Chc1w89lnpapVCTgBALAj1LQAoSYAAACAO7l2zezBOX++tGSJdOWK816xYmbA2b69VLkyAScAIH0j1LQAoSYAAACA+7l6VVq+3JzBuXSpeW5XooRzBmeFCgScAID0h1DTAoSaAAAAABLjyhXpp5/MgHPZMnNGp12pUs4ZnOXKEXACANIHQk0LEGoCAAAASKrLl82ZmwsWmEFndLTzXpkyzoCzbFnragQAIKURalqAUBMAAABAcoiKMgPO+fOlFSukGzec98qXdwacpUpZVyMAACmBUNMChJoAAAAAkltkpLR4sTmDc+VK6eZN572KFZ0B58MPW1cjAADJhVDTAoSaAAAAAFJSRIT044/mDM5Vq6Rbt5z3qlQxw8127aTixS0rEQCAB0KoaQFCTQAAAACp5eJF6YcfzBmcq1dLMTHOe9WqOQPOokUtKxEAgEQj1LQAoSYAAAAAK5w/bwac8+dLa9dKsbHOe4884gw4H3rIshIBAEgQQk0LEGoCAAAAsNq5c9L335szONevdw04a9VyBpyFCllWIgAAd0WoaQFCTQAAAADu5MwZ6bvvzIBz40Yp7v/yq1PHDDifeUYKDLSuRgAA4iLUtAChJgAAAAB39e+/zoDzl1+cAafNJtWt6ww48+e3tk4AQPpGqGkBQk0AAAAAnuCff8yAc/58afNm53WbTWrQwAw427aV8uWzrkYAQPpEqGkBQk0AAAAAnubUKenbb80ZnFu2OK97eUkNGzoDzjx5LCsRAJCOEGpagFATAAAAgCf76y8z4Jw/X9q+3Xnd21tq3NgMOJ96SsqVy7oaAQBpG6GmBQg1AQAAAKQVx49LCxeaMzh37HBe9/aWmjZ1Bpw5clhXIwAg7SHUtAChJgAAAIC06OhRM9xcsEDatct5PUMG6bHHzIDzySel7NmtqhAAkFYQalqAUBMAAABAWnfokHMG5+7dzusZMkjBwWbA2bq1FBBgXY0AAM9FqGkBQk0AAAAA6cmBA84ZnPv2Oa/7+kotWpgB5xNPSNmyWVcjAMCzEGpagFATAAAAQHq1b585g3P+fDPstPPzk1q2NAPOxx+Xsma1rkYAgPsj1LQAoSYAAACA9M4wzIBzwQIz4Dx0yHkvUyapVSsz4GzZUsqSxbo6AQDuiVDTAoSaAAAAAOBkGGbfTXvAefSo817mzObMzfbtzaXqmTNbVycAwH0QalqAUBMAAAAA7swwzJ3T7QHn8ePOe1mymJsLtW8vNW8uZcxoWZkAAIsRalqAUBMAAAAA7s8wpB07nJsM/fWX8162bM6AMzjY7MkJAEg/CDUtQKgJAAAAAIljGNL27ebszYULpVOnnPf8/aU2bcyA87HHzF3VAQBpW2LyNa9UqilJQkJCZLPZXB6lS5d23L9+/br69OmjXLlyKWvWrHr66ad15swZl9c4efKkWrVqpcyZMytv3rwaMmSIbt265TJm/fr1qlq1qvz8/FSiRAnNmTMnNT4eAAAAAKRrNpv0yCPS5MnSiRPS5s1S//5SwYJSVJT0+edm7818+aRu3aTly6WbN62uGgDgDtw61JSkcuXK6d9//3U8fvnlF8e9AQMGaMmSJVq4cKE2bNig06dPq23bto77MTExatWqlW7cuKHNmzdr7ty5mjNnjkaOHOkYc/z4cbVq1UqNGjXSrl271L9/f/Xo0UMrV65M1c8JAAAAAOmZl5dUu7b03nvSyZPSL79Ir74qFSggRURIc+aYu6bnyyf16CH9/DMBJwCkZ269/DwkJEQ//PCDdu3aFe9eZGSk8uTJo3nz5umZZ56RJB04cEBlypRRWFiYatWqpeXLl+vxxx/X6dOnlS9fPknSjBkzNHToUJ07d06+vr4aOnSoli1bpr179zpeu0OHDoqIiNCKFSsSXCvLzwEAAAAg+cXESL/+avbf/PZbKe7ivFy5pLZtzSXqDRtKPj6WlQkASAZpZvm5JB0+fFiBgYEqVqyYnn/+eZ08eVKStGPHDt28eVNNmzZ1jC1durQeeughhYWFSZLCwsJUo
UIFR6ApScHBwYqKitK+ffscY+K+hn2M/TXuJjo6WlFRUS4PAAAAAEDy8vaW6teXpk2T/vlHWrdO6t1bypNHunBBmjXL7LkZGCi9/LJ5PybG6qoBACnNrUPNmjVras6cOVqxYoU++ugjHT9+XPXq1dOlS5cUHh4uX19fZc+e3eU5+fLlU3h4uCQpPDzcJdC037ffu9eYqKgoXbt27a61jR07VgEBAY5HUFDQg35cAAAAAMA9eHubMzI/+kg6fVpavVrq1cucsXnunDRjhtS4sdmTs08facMGAk4ASKvcOtRs0aKF2rVrp4oVKyo4OFg//fSTIiIitGDBAqtL0/DhwxUZGel4nIq7TR8AAAAAIEX5+EhNmkgffyz9+6/ZY7N7dylHDnOJ+vTpZgAaFCT162f26IyNtbpqAEBycetQ83bZs2dXyZIldeTIEeXPn183btxQRESEy5gzZ84of/78kqT8+fPH2w3dfn6/Mf7+/sqUKdNda/Hz85O/v7/LAwAAAACQ+jJkMJegf/KJGWguX27ulp49uxl4fvCBVK+eGXD272/usk7ACQCezaNCzcuXL+vo0aMqUKCAqlWrpgwZMmjNmjWO+wcPHtTJkydVu3ZtSVLt2rW1Z88enT171jFm1apV8vf3V9myZR1j4r6GfYz9NQAAAAAAniNDBql5c2n2bDPgXLZM6txZ8vc3l6xPmSLVqSMVKSINGiRt3Sq57/a5AIC7cevdzwcPHqwnnnhChQsX1unTpzVq1Cjt2rVLf/75p/LkyaOXX35ZP/30k+bMmSN/f3+9+uqrkqTNmzdLkmJiYlS5cmUFBgZqwoQJCg8PV6dOndSjRw+98847kqTjx4+rfPny6tOnj1588UWtXbtW/fr107JlyxQcHJzgWtn9HAAAAADcV3S0uUR9wQLpxx+lS5ec9woXltq1M3dRr15dstmsqxMA0rPE5GtuHWp26NBBGzdu1IULF5QnTx7VrVtXb7/9tooXLy5Jun79ugYNGqSvv/5a0dHRCg4O1vTp0x1LyyXpr7/+0ssvv6z169crS5Ys6tKli8aNGycfHx/HmPXr12vAgAH6888/VahQIb355pvq2rVromol1AQAAAAAz3D9urRypTR/vrR4sXTlivNe0aJmuNm+vVSlCgEnAKSmNBNqehJCTQAAAADwPNeumT04FyyQliyRrl513ite3BlwVqpEwAkAKY1Q0wKEmgAAAADg2a5eNXtwLlhg/rx2zXmvZElnwFm+vDPgDAmRvL2lN9+M/3qhoVJMjDkGAHB/icnXPGqjIAAAAAAAUkrmzGZvzYULpbNnpW++kdq2lTJmlA4dkt56S6pYUSpbVho1Stq3zww0R440A8y4QkPN697e1nwWAEjrmKmZTJipCQAAAABp06VL0tKl5gzO5cvNTYfsypWT8uaV1q2TxowxZ2zaA037OQAgYVh+bgFCTQAAAABI+6KizM2FFiyQVqyQbt50ve/tbS45Hz5ceucda2oEAE9FqGkBQk0AAAAASF8iIsyAc/586eefpVu3XO+XKiXVru18lC3LcnQAuBdCTQsQagIAAABA+jVihDR2rOTlJcXG3nlMtmxSzZrSo4+aIWfNmlKOHKlbJwC4MzYKAgAAAAAglYSGmoHmmDHm0vMxY8zrzz8vvf661LixlDWr2Ztz9WrzfosWUs6c5uzN7t2lTz4xNx66WyAKAHDlY3UBAAAAAAB4qjttCmT/ab++Zo0Zdu7dK4WFSZs3mz+PHJH27zcfs2ebzwkIkGrVci5Zr1nTvAYAcMXy82TC8nMAAAAASH9CQsw+mXfa5Tw01AwzQ0Lu/Nxz56QtW8yAMyxM2rZNunrVdYzNZs7mjNubs1Qpc5k7AKQ19NS0AKEmAAAAAOBB3Lol7dnjOpvz2LH443LkcJ3N+cgjEv8zFEBaQKhpAUJNAAAAAEByO3PGdTbn9u3StWuuY2w2qXx519mcJUua1wHAkxBqWoBQEwAAAADwf+3df2yV9b3A8U9p+VGxdILFUZCCoAURFIR5gc1gpsuIcjUzbiPsjsmWzYhzzMxcMAGZiLosMyxuY7o4NCIzxglsGkNkOpiKE9QqDQzB34gGxgQKClxOz/3jpLanLWAV+pynvF7JybGnz8HPSR4bfPf7PN/j7f/+L+LVVxtXcq5ZE/HWWy2P69kzP3J+6Uu5zYoACpmomQBREwAAgCR88EFj4FyzJmLduoj9+/OP6dQpYvjw/NA5eLDVnEBhETUTIGoCAABQCA4ejHjllfzVnO+80/K4U0/Nj5xjxkR0797+8wI0EDUTIGoCAABQqLZta7ma8+DB/GOKiyNGjGiMnOPGRQwcaDUn0H5EzQSImgAAAKTFgQMRL7+cHzq3bm15XO/e+as5R4+OOOmk9p8XODGImgkQNQEAAEizrVvzI+eLL+Y2JmqqpCTi3HPzV3NWVVnNCRwbomYCRE0AAAA6kv37I156KT90btvW8rgvfjF/Nef550eUlrb/vED6iZoJEDUBAADoyLLZiHffbQyczz2Xu4T90KH84zp3jjjvvMaVnGPHRpx+utWcwNGJmgkQNQEAADjRfPxx7jL1pqs5P/ig5XGVlfmrOUeNiujWrf3nBQqbqJkAURMAAIATXTYb8fbbjSs516yJqKmJyGTyj+vSJWLkyMaVnGPHRvTrl8jIQAERNRMgagIAAEBLH30UsW5d/mrO7dtbHtevX/5qzpEjI7p2bf95geSImgkQNQEAAODostmIN97Ij5yvvtpyNWfXrrnL1Juu5qysTGZmoH2ImgkQNQEAAOCz2bcvYu3a/ND573+3PK5///zVnOedl7uUHegYRM0EiJoAAABwbGSzEVu25EfO9esj6uvzj+vWLWL06PzQ+cUvJjMz8PmJmgkQNQEAAOD4qatruZrzP/9pedyAAfmR89xzIzp3bvdxgc9A1EyAqAkAAADtJ5uNeO21/MhZW5t7vanS0ogxY/JDZ+/eycwMHJmomQBREwAAAJK1Z0/ECy/kAudzz0U8/3zErl0tjzvjjPzIOWJERElJu48LNCNqJkDUBAAAgMJSXx+xaVP+as4NG1qu5jzppIgvfSk/dJ56ajIzw4lM1EyAqAkAAACFb9eu3GrO557LRc5//jNi9+6Wxw0enB85zznHak443kTNBIiaAAAAkD719REbN+av5ty4seVxJ5+cv5rzv/4rolev9p8XOjJRMwGiJgAAAHQMH36YW8HZdDVnXV3L4846Kxc4x43LPZ99dkRxcfvPCx2FqJkAURMAAAA6pkwmdy/Opqs5N21qeVxZWcQFF+Sv5jzllPxj5s7Nhc/Zs1u+f9683L9r7tzj8Smg8LWlr7kbBAAAAMARFBdHDB+ee/zwh7nXdu7M7a7eEDlfeCG3mnPlytyjwZAh+as5O3WKmDMn972mYXPevNzrt9zSfp8L0sxKzWPESk0AAAA4cWUyEbW1+as5N29ueVx5eW5n9ddfj/if/4m4886IhQsbg2ZrKzjhROHy8wSImgAAAEBTO3a0XM350UetH1tVFfH1r+fuyzl0aO65sjKi
qKh9Z4YkiZoJEDUBAACAIzl0KGL9+sbIuXjxkY/v0SMXOBsiZ8PzgAG5y9iho3FPTQAAAIACU1ISMXJk7rFzZ+61Ll0iDh6MuPLKiOrq3IZEGzdGbNkSsWdPbuf1f/4z/8/p1i13r87msXPw4IjOndv/c0ESRE0AAACAdtR0U6DZs/O/Xro0d8yBA7mw2RA5G543bYrYvz+ipib3aKqkJOLMM1vGzurqiNLS9v6UcHyJmgAAAADtpHnQjGh8broreteuEcOG5R5NZTIRb77ZMnZu3Bixd2/jPz/6aON7iooiBg5sGTuHDs1d4g5pJGoCAAAAtJNMpvVdzhu+zmSO/P7i4txl5oMHR/z3fze+ns1GbN3aMnZu2BDxn/9EvPFG7vH44/l/Xt++rcfOiorP/1nheLJR0DFioyAAAACg0GSzuV3YW4ud779/+PedemrrsbNvXzuyc/zY/TwBoiYAAACQJrt2NV6u3jR2vvXW4d9TVtZ67BwwILeKFD4PUTMBoiYAAADQEezbl9uQqHns3LLl8JfHd+uW25CoeewcPDi3wzt8GqJmAkRNAAAAoCM7eDBi8+aWsXPTptxu7a0pKcmFzeaxs7o64qST2nd+Cl9b+pqNggAAAAA4qi5djrwje/PY2bAj+7/+lXs0VVSUu2S9eewcOjSivLzdPhIpZqXmMWKlJgAAAECjhh3Zm8fOhh3ZD6eysmXsPPtsO7KfCFx+ngBREwAAAODoGnZkb21l57Zth39fr16tx047snccomYCRE0AAACAz2fXrtyl6s1j55tvHv49DTuyN4+ddmRPH1EzAaImAAAAwPHx0Ue5DYmax87Nm4++I3vz2GlH9sIlaiZA1AQAAABoXwcPRmzZ0jJ2/utfR9+RvXnstCN78kTNBIiaAAAAAIUhk4l4662WsXPDhtyO7K1p2JG9eey0I3v7ETUTIGoCAAAAFLZsNuK991qPnTt3Hv59lZWtx86KCpsUHUuiZgJETQAAAID02rGj9dh5tB3ZW4ud/fqJnZ+FqJkAURMAAACg49m9Oxc4m8fOt97KrfxsTVlZxJAhLWPnwIF2ZD8SUTMBoiYAAADAiaNhR/bmsXPLlohDh1p/T9euuQ2JmsfOM888/I7sc+fmQujs2S2/N29e7v6hc+ceq0+VrLb0tZJ2mgkAAAAAOoyTTooYOTL3aKphR/bmsXPTpoj9+yNefTX3aKq4OLcje/PYOWRI7ntz5uSOaxo2583LvX7LLcf3cxYqKzWPESs1AQAAADichh3Zm8fOjRsj6upaf09RUURVVUTnzhGbN0dccUXE//5vxJNPNgbN1lZwppXLzxMgagIAAADQVg07sjePnUfbkb2jBc0IUTMRoiYAAAAAx9KOHfmx8667chG0S5eIAweSnu7Ya0tf69ROMwEAAAAAbVBREXHhhRHXXBNx6qmNQfPgwdw9NU9koiYAAAAAFLCmmwIdOJB7njPnxA6bdj8HAAAAgALVNGg23EOz4bm1XdFPFKImAAAAABSoTKb1TYEavs5k2n+mQmCjoGPERkEAAAAA8NnZKAgAAAAA6LBETQAAAAAgVURNAAAAACBVRE0AAAAAIFVETQAAAAAgVURNAAAAACBVRE0AAAAAIFVETQAAAAAgVURNAAAAACBVRE0AAAAAIFVETQAAAAAgVURNAAAAACBVRE0AAAAAIFVETQAAAAAgVURNAAAAACBVRE0AAAAAIFVETQAAAAAgVURNAAAAACBVRE0AAAAAIFVETQAAAAAgVURNAAAAACBVSpIeoKPIZrMREbFnz56EJwEAAACA9Gnoag2d7UhEzWOkrq4uIiJOP/30hCcBAAAAgPSqq6uL8vLyIx5TlP006ZOjqq+vj23btkVZWVkUFRUlPc5xsWfPnjj99NPj3XffjR49eiQ9DgXO+UJbOWdoK+cMbeWcoa2cM7SVc4a2cs7QVh39nMlms1FXVxeVlZXRqdOR75pppeYx0qlTp+jXr1/SY7SLHj16dMj/cDg+nC+0lXOGtnLO0FbOGdrKOUNbOWdoK+cMbdWRz5mjrdBsYKMgAAAAACBVRE0AAAAAIFVETT61rl27xs033xxdu3ZNehRSwPlCWzlnaCvnDG3lnKGtnDO0lXOGtnLO0FbOmUY2CgIAAAAAUsVKTQAAAAAgVURNAAAAACBVRE0AAAAAIFVETQAAAAAgVURNjmr16tUxadKkqKysjKKioli2bFnSI1HAbr/99hgzZkyUlZVF796944orrohNmzYlPRYFbOHChTFixIjo0aNH9OjRI8aOHRtPPPFE0mORInfccUcUFRXFjBkzkh6FAjV37twoKirKewwZMiTpsShw7733XnznO9+JXr16RWlpaQwfPjzWrVuX9FgUqAEDBrT4OVNUVBTTp09PejQKUCaTidmzZ8fAgQOjtLQ0Bg0aFPPmzQv7OHMkdXV1MWPGjKiqqorS0tIYN25crF27NumxElWS9AAUvn379sW5554b06ZNi2984xtJj0OBW7VqVUyfPj3GjBkThw4diptuuim+9rWvxYYNG6J79+5Jj0cB6tevX9xxxx1x5plnRjabjfvvvz8uv/zyePnll2PYsGFJj0eBW7t2bdx9990xYsSIpEehwA0bNixWrlz5ydclJf4azOF9+OGHMX78+LjoooviiSeeiIqKiti8eXOccsopSY9GgVq7dm1kMplPvq6trY1LLrkkrrrqqgSnolD94he/iIULF8b9998fw4YNi3Xr1sXVV18d5eXlcf311yc9HgXqBz/4QdTW1sYDDzwQlZWVsXjx4rj44otjw4YN0bdv36THS0RR1q8CaIOioqJYunRpXHHFFUmPQkrs2LEjevfuHatWrYoLL7ww6XFIiZ49e8Yvf/nL+P73v5/0KBSwvXv3xqhRo+J3v/td3HrrrXHeeefFggULkh6LAjR37txYtmxZ1NTUJD0KKTFz5sx49tln4x//+EfSo5BSM2bMiMceeyw2b94cRUVFSY9DgbnsssvitNNOi3vvvfeT16688sooLS2NxYsXJzgZherjjz+OsrKyWL58eVx66aWfvH7++efHxIkT49Zbb01wuuS4/Bw4rnbv3h0RuUgFR5PJZOKhhx6Kffv2xdixY5MehwI3ffr0uPTSS+Piiy9OehRSYPPmzVFZWRlnnHFGTJkyJd55552kR6KA/eUvf4nRo0fHVVddFb17946RI0fGH/7wh6THIiUOHjwYixcvjmnTpgmatGrcuHHxt7/9LV577bWIiHjllVfimWeeiYkTJyY8GYXq0KFDkclkolu3bnmvl5aWxjPPPJPQVMlz3Q1w3NTX18eMGTNi/Pjxcc455yQ9DgVs/fr1MXbs2Ni/f3+cfPLJsXTp0jj77LOTHosC9tBDD8VLL710wt9HiE/nggsuiPvuuy+qq6vj/fffj5///Ofxla98JWpra6OsrCzp8ShAb7zxRixcuDBuuOGGuOmmm2Lt2rVx/fXXR5cuXWLq1KlJj0eBW7ZsWezatSu+973vJT0KBWrmzJmxZ8+eGDJkSBQXF0cmk4n58+f
HlClTkh6NAlVWVhZjx46NefPmxdChQ+O0006LP/3pT7FmzZoYPHhw0uMlRtQEjpvp06dHbW3tCf2bIz6d6urqqKmpid27d8cjjzwSU6dOjVWrVgmbtOrdd9+Nn/zkJ/Hkk0+2+G01tKbpypcRI0bEBRdcEFVVVfHwww+7zQWtqq+vj9GjR8dtt90WEREjR46M2tra+P3vfy9qclT33ntvTJw4MSorK5MehQL18MMPx4MPPhhLliyJYcOGRU1NTcyYMSMqKyv9jOGwHnjggZg2bVr07ds3iouLY9SoUTF58uR48cUXkx4tMaImcFxcd9118dhjj8Xq1aujX79+SY9DgevSpcsnv2E8//zzY+3atfHrX/867r777oQnoxC9+OKLsX379hg1atQnr2UymVi9enX85je/iQMHDkRxcXGCE1LovvCFL8RZZ50VW7ZsSXoUClSfPn1a/GJt6NCh8ec//zmhiUiLt99+O1auXBmPPvpo0qNQwG688caYOXNmfPvb346IiOHDh8fbb78dt99+u6jJYQ0aNChWrVoV+/btiz179kSfPn3iW9/6VpxxxhlJj5YY99QEjqlsNhvXXXddLF26NJ566qkYOHBg0iORQvX19XHgwIGkx6BAffWrX43169dHTU3NJ4/Ro0fHlClToqamRtDkqPbu3Ruvv/569OnTJ+lRKFDjx4+PTZs25b322muvRVVVVUITkRaLFi2K3r17523kAc199NFH0alTfo4pLi6O+vr6hCYiTbp37x59+vSJDz/8MFasWBGXX3550iMlxkpNjmrv3r15KxnefPPNqKmpiZ49e0b//v0TnIxCNH369FiyZEksX748ysrK4oMPPoiIiPLy8igtLU14OgrRrFmzYuLEidG/f/+oq6uLJUuWxN///vdYsWJF0qNRoMrKylrcp7d79+7Rq1cv9++lVT/72c9i0qRJUVVVFdu2bYubb745iouLY/LkyUmPRoH66U9/GuPGjYvbbrstvvnNb8YLL7wQ99xzT9xzzz1Jj0YBq6+vj0WLFsXUqVOjpMT/anN4kyZNivnz50f//v1j2LBh8fLLL8edd94Z06ZNS3o0CtiKFSsim81GdXV1bNmyJW688cYYMmRIXH311UmPlhg/aTmqdevWxUUXXfTJ1zfccENEREydOjXuu+++hKaiUC1cuDAiIiZMmJD3+qJFi9wsnVZt3749vvvd78b7778f5eXlMWLEiFixYkVccsklSY8GdBBbt26NyZMnx86dO6OioiK+/OUvx/PPPx8VFRVJj0aBGjNmTCxdujRmzZoVt9xySwwcODAWLFhgEw+OaOXKlfHOO+8IUxzVXXfdFbNnz45rr702tm/fHpWVlfGjH/0o5syZk/RoFLDdu3fHrFmzYuvWrdGzZ8+48sorY/78+dG5c+ekR0tMUTabzSY9BAAAAADAp+WemgAAAABAqoiaAAAAAECqiJoAAAAAQKqImgAAAABAqoiaAAAAAECqiJoAAAAAQKqImgAAAABAqoiaAAAAAECqiJoAAHRoEyZMiBkzZiQ9BgAAx5CoCQAAAACkiqgJAAAAAKSKqAkAwAnl8ccfj/Ly8njwwQeTHgUAgM+oJOkBAACgvSxZsiSuueaaWLJkSVx22WVJjwMAwGdkpSYAACeE3/72t3HttdfGX//6V0ETACDlrNQEAKDDe+SRR2L79u3x7LPPxpgxY5IeBwCAz8lKTQAAOryRI0dGRUVF/PGPf4xsNpv0OAAAfE6iJgAAHd6gQYPi6aefjuXLl8ePf/zjpMcBAOBzcvk5AAAnhLPOOiuefvrpmDBhQpSUlMSCBQuSHgkAgM9I1AQA4IRRXV0dTz31VEyYMCGKi4vjV7/6VdIjAQDwGRRl3VQIAAAAAEgR99QEAAAAAFJF1AQAAAAAUkXUBAAAAABSRdQEAAAAAFJF1AQAAAAAUkXUBAAAAABSRdQEAAAAAFJF1AQAAAAAUkXUBAAAAABSRdQEAAAAAFJF1AQAAAAAUuX/AUy/fimAZFjMAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "distortions = []\n", + "K = range(1, 10) # Adjust the range as needed\n", + "for k in K:\n", + " kmeanModel = KMeans(n_clusters=k)\n", + " kmeanModel.fit(pca_features)\n", + " distortions.append(kmeanModel.inertia_)\n", + "\n", + "# Plot the Elbow\n", + "plt.figure(figsize=(16, 8))\n", + "plt.plot(K, distortions, \"bx-\")\n", + "plt.xlabel(\"k\")\n", + "plt.ylabel(\"Distortion\")\n", + "plt.title(\"The Elbow Method showing the optimal k\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "fb0c72ce-cbd6-4e64-b676-1c9fa30d834c", + "metadata": { + "tags": [] + }, + "source": [ + "## Choose the right number of clusters based on the Elbow method" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "179a1ef1-0706-4562-9121-1be8ea68ee75", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAj4AAAHHCAYAAAC/R1LgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAABSfUlEQVR4nO3dd1gUV9sG8HsW2AWBXVApomAXRSwRG/ZCxMTeSxJFjX5RNLEkUVNseQ2Jmpho1GgSSxIVSyyvpijBFiMaxd6wC4pgpUhn93x/8LJxqassLDD377r2Spg5M/vsYd29mTlzRhJCCBARERHJgMLcBRARERGVFAYfIiIikg0GHyIiIpINBh8iIiKSDQYfIiIikg0GHyIiIpINBh8iIiKSDQYfIiIikg0GHyIiIpINBh8ymxo1aiAgIMDcZZRKc+bMgSRJ5i4jzzrM8Xtbu3YtJEnCrVu3CmxXWvqtrJMkCXPmzDF3GfkKCAhAjRo1zF0GlVEMPmRy169fx//93/+hVq1asLa2hlqtRtu2bfH1118jJSWlRGpITk7GnDlzcODAgRJ5Piqfbt26BUmS9A8LCwt4eHigX79+OH36dK72qampWLx4MVq1agWNRgNra2vUq1cPEydOxJUrV/J8jvfffx+SJGHIkCHF/GqICAAszV0AlS+//vorBg0aBJVKhREjRsDb2xvp6ek4fPgw3nvvPVy4cAGrVq0q9jqSk5Mxd+5cAECnTp2K/flM7aOPPsKMGTPMXUaeIiIioFCUzr+Ziqvfhg0bhldffRVarRaXLl3CihUr8Pvvv+Po0aNo2rQpAODhw4fo3r07wsPD0bNnTwwfPhx2dnaIiIhAcHAwVq1ahfT0dIP9CiGwceNG1KhRA7t27UJiYiLs7e1NXj8R/YvBh0zm5s2bGDp0KKpXr459+/ahSpUq+nWBgYG4du0afv31VzNWWHRJSUmwtbUt9uextLSEpWXp/OepUqnMXUK+iqvfmjVrhtdff13/c9u2bdG7d2+sWLECK1euBJB1+uXUqVPYunUrBgwYYLD9J598gg8//DDXfg8cOIA7d+5g37598Pf3x7Zt2zBy5MgXrjM5ORkVKlR44e2J5KB0/tlGZdKCBQvw9OlT/PDDDwahJ1udOnXwzjvv5Lt9fuMz8hrfceLECfj7+6Ny5cqwsbFBzZo1MXr0aABZpyecnJwAAHPnztWfpnh2zMLly5cxcOBAVKxYEdbW1mjevDn++9//5vm8Bw8exIQJE+Ds7Ixq1aoBABITEzF58mTUqFEDKpUKzs7OePnll3Hy5Ml8X9/WrVv1+8tp5cqVkCQJ58+fz7cvQkJC0K5dOzg4OMDOzg6enp744IMPCuwnIOvLVZIkg9N+f/31FwYNGgQPDw+oVCq4u7tjypQpRp2KzDnG59lTQTkfz9ZiTJ8DwIULF9ClSxfY2NigWrVq+M9//gOdTldoXUDe/SZJEiZOnIgdO3bA29sbKpUKDRs2xB9//GHUPvPSpUsXAFlhHwCOHTuGX3/9FWPGjMkVeoCssLho0aJcy9evXw8vLy907twZfn5+WL9+vdE1dOrUCd7e3ggPD0eHDh1QoUIF/fshLS0Ns2fPRp06dfS/3/fffx9paWkG+0hLS8OUKVPg5OQEe3t79O7dG3fu3Mn1XPmNqcnv3+zPP/+Mli1bokKFCnB0dESHDh2wd+9egza///472rdvD1tbW9jb26NHjx64cOFCrn1l/96sra3h7e2N7du3G91HBX1OAP+eyly0aBEWL16M6tWrw8bGBh07dtT/W8x29uxZBAQE6E/hu7q6YvTo0Xj06FGu57179y7GjBkDNzc3qFQq1KxZE+PHjzc44hcXF4fJkyfD3d0dKpUKderUweeff270e51eXOn8k5LKpF27dqFWrVpo06ZNsT7P/fv30a1bNzg5OWHGjBlwcHDArVu3sG3bNgCAk5MTVqxYgfHjx6Nfv37o378/AKBx48YAsr5Y27Zti6pVq2LGjBmwtbXF5s2b0bdvX/zyyy/o16+fwfNNmDABTk5OmDVrFpKSkgAAb731FrZu3YqJEyfCy8sLjx49wuHDh3Hp0iU0a9Ysz7p79OgBOzs7bN68GR07djRYt2nTJjRs2BDe3t55bnvhwgX07NkTjRs3xrx586BSqXDt2jX8/fffL9SHW7ZsQXJyMsaPH49KlSrhn3/+wdKlS3Hnzh1s2bLlufb1008/5Vr20Ucf4f79+7Czs9PXb0yfx8TEoHPnzsjMzNS3W7VqFWxsbF7odWY7fPgwtm3bhgkTJsDe3h5LlizBgAEDEBkZiUqVKj33/q5fvw4A+m2zA9wbb7xh9D7S0tLwyy+/YNq0aQCyTqeNGjUKMTExcHV1NWofjx49wiuvvIKhQ4fi9ddfh4uLC3Q6HXr37o3Dhw9j3LhxaNCgAc6dO4fFixfjypUr2LFjh377N998Ez///DOGDx+ONm3aYN++fejRo4fRryEvc+fOxZw5c9CmTRvMmzcPSqUSx44dw759+9CtWzcAWe+ZkSNHwt/fH59//jmSk5OxYsUKtGvXDqdOndKHrL1792LAgAHw8vJCUFAQHj16hFGjRun/AClIYZ8Tz/rxxx+RmJiIwMBApK
[base64-encoded PNG output truncated — the image is the "Clusters visualized in PCA-reduced space" scatter plot (PCA 1 vs. PCA 2, points colored by cluster) produced by the KMeans cell that follows]\n",
+       "text/plain": [
+        "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "num_clusters = 3 # Replace with the chosen number of clusters\n", + "kmeans = KMeans(n_clusters=num_clusters)\n", + "clusters = kmeans.fit_predict(pca_features)\n", + "\n", + "# Visualizing the clusters\n", + "plt.scatter(pca_features[:, 0], pca_features[:, 1], c=clusters, cmap=\"viridis\")\n", + "plt.xlabel(\"PCA 1\")\n", + "plt.ylabel(\"PCA 2\")\n", + "plt.title(\"Clusters visualized in PCA-reduced space\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "5f7e2f2d-bd02-48c4-90fa-3f8e5b0f5dc2", + "metadata": {}, + "source": [ + "## View the mapping of each filename to its corresponding cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "24039577-3e96-43e1-9e22-918499328f5b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Cluster Filename\n", + " 0 Accessmedia_synthesized_1.pdf\n", + " 0 Accessmedia_synthesized_2.pdf\n", + " 0 Hotwire_synthesized_1.pdf\n", + " 0 Hotwire_synthesized_2.pdf\n", + " 0 Lion_Co_synthesized_1.pdf\n", + " 0 Lion_Co_synthesized_2.pdf\n", + " 1 ACME_synthesized_1.pdf\n", + " 1 ACME_synthesized_2.pdf\n", + " 1 ADP_synthesized_1.pdf\n", + " 1 ADP_synthesized_2.pdf\n", + " 1 American_synthesized_1.pdf\n", + " 1 American_synthesized_2.pdf\n", + " 1 Cele_synthesized_1.pdf\n", + " 1 Cele_synthesized_2.pdf\n", + " 1 Covey_synthesized_1.pdf\n", + " 1 Covey_synthesized_2.pdf\n", + " 1 Globex_synthesized_1.pdf\n", + " 1 Globex_synthesized_2.pdf\n", + " 1 IETEC_synthesized_1.pdf\n", + " 1 IETEC_synthesized_2.pdf\n", + " 1 Imagintime_synthesized_1.pdf\n", + " 1 Imagintime_synthesized_2.pdf\n", + " 1 Jacks_synthesized_1.pdf\n", + " 1 Jacks_synthesized_2.pdf\n", + " 2 Brick_company_1.pdf\n", + " 2 Brick_company_2.pdf\n", + " 2 FWC_synthesized_1.pdf\n", + " 2 FWC_synthesized_2.pdf\n" + ] + } + ], + "source": [ + "# Create a DataFrame for cluster mapping\n", + "cluster_data = []\n", + "for cluster, filenames in cluster_mapping.items():\n", + " for filename in set(filenames): # Use set to avoid duplicate filenames\n", + " cluster_data.append({\"Cluster\": cluster, \"Filename\": filename})\n", + "\n", + "df = pd.DataFrame(cluster_data)\n", + "\n", + "# Sort by cluster and then filename for better readability\n", + "df.sort_values(by=[\"Cluster\", \"Filename\"], inplace=True)\n", + "\n", + "# Display the DataFrame as a table\n", + "print(df.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be18eb2a-e0ab-4139-8f6f-a229431d8ade", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "conda-root-py", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/docai_pdf_clustering_analysis_tool/images/cc2.png b/incubator-tools/docai_pdf_clustering_analysis_tool/images/cc2.png new file mode 100644 index 000000000..703e7ad53 Binary files /dev/null and 
b/incubator-tools/docai_pdf_clustering_analysis_tool/images/cc2.png differ diff --git a/incubator-tools/docai_pdf_clustering_analysis_tool/images/cc3.png b/incubator-tools/docai_pdf_clustering_analysis_tool/images/cc3.png new file mode 100644 index 000000000..8cfea418f Binary files /dev/null and b/incubator-tools/docai_pdf_clustering_analysis_tool/images/cc3.png differ diff --git a/incubator-tools/document-schema-from-form-parser-output/Images/CSV_output.png b/incubator-tools/document-schema-from-form-parser-output/Images/CSV_output.png new file mode 100644 index 000000000..5057bbbaa Binary files /dev/null and b/incubator-tools/document-schema-from-form-parser-output/Images/CSV_output.png differ diff --git a/incubator-tools/document-schema-from-form-parser-output/Images/Form_parser_output.png b/incubator-tools/document-schema-from-form-parser-output/Images/Form_parser_output.png new file mode 100644 index 000000000..2e849186a Binary files /dev/null and b/incubator-tools/document-schema-from-form-parser-output/Images/Form_parser_output.png differ diff --git a/incubator-tools/document-schema-from-form-parser-output/Images/processor_output.png b/incubator-tools/document-schema-from-form-parser-output/Images/processor_output.png new file mode 100644 index 000000000..d465cd91f Binary files /dev/null and b/incubator-tools/document-schema-from-form-parser-output/Images/processor_output.png differ diff --git a/incubator-tools/document-schema-from-form-parser-output/README.md b/incubator-tools/document-schema-from-form-parser-output/README.md new file mode 100644 index 000000000..2e3f631f1 --- /dev/null +++ b/incubator-tools/document-schema-from-form-parser-output/README.md @@ -0,0 +1,31 @@ +# Purpose and Description + +This Document guides to create document schema from key value pairs of form parser output in csv format (The generated schema can be reviewed, updated). +This schema can be used to update for any parser through API. 
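+
+Before running the notebook, it can help to confirm that the Form Parser output path (see **Setup the required inputs** below) actually contains JSON files. A minimal sketch using the `google-cloud-storage` client, with placeholder project, bucket, and prefix values:
+
+```python
+from google.cloud import storage
+
+client = storage.Client(project="your-project-id")  # placeholder project
+blobs = client.list_blobs("your-bucket", prefix="form-parser-output/")  # placeholder path
+json_files = [blob.name for blob in blobs if blob.name.endswith(".json")]
+print(f"Found {len(json_files)} Form Parser JSON files")
+```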
+ +## Setup the required inputs +* **project_id** : Your Google project id or name +* **formparser_parsed_jsons_path** : GCS storegar path where the form parser output is saved + +## CSV schema output + +Form parser output in UI + +Form_parser_output + +Retrieved schema from code in the form of csv(‘document_schema.csv’) + +CSV_output + +## Updating Schema to another parser + +## Setup the inputs +* **project_id** : Your Google project id or name +* **location_processo** : Location of processor +* **processor_id** : to which schema has to be updated +* **updated_schema_csv_path** : csv file modified or reviewed from above step + +## Output +The above script adds the schema in the parser as below + +processor_output diff --git a/incubator-tools/document-schema-from-form-parser-output/document-schema-from-form-parser-output.ipynb b/incubator-tools/document-schema-from-form-parser-output/document-schema-from-form-parser-output.ipynb new file mode 100644 index 000000000..ee4db2e45 --- /dev/null +++ b/incubator-tools/document-schema-from-form-parser-output/document-schema-from-form-parser-output.ipynb @@ -0,0 +1,542 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "55646894-baf1-4c1c-9790-5fc16468a282", + "metadata": {}, + "source": [ + "# Document Schema from Form Parser Output" + ] + }, + { + "cell_type": "markdown", + "id": "9c1e5f0d-91ea-4766-8ab3-f66429e66e1b", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "528c5a36-a860-468f-a30a-08d051880aee", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "c195555a-cb6d-4896-af89-3e97b3312cc1", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This Document guides to create document schema from key value pairs of form parser output in csv format (The generated schema can be reviewed, updated). 
This schema can be used to update for any parser through API.\n" + ] + }, + { + "cell_type": "markdown", + "id": "3b5734e7-8944-485a-9412-ae063ef57318", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "* Python : Jupyter notebook (Vertex AI) \n", + "* Service account permissions in projects.\n", + "* GCS Folder Path which has form parser parsed jsons\n" + ] + }, + { + "cell_type": "markdown", + "id": "ce627c16-65b6-4870-b0b7-28d5fe599905", + "metadata": {}, + "source": [ + "## Step by Step procedure \n", + "\n", + "### 1.Importing Required Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d1964d8-b1e4-42b7-a399-40861581c371", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f1928e8-b562-443d-b17d-88f42cebda45", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import re\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "from google.cloud import storage\n", + "from tqdm import tqdm\n", + "from utilities import *\n", + "import pandas as pd\n", + "from collections import Counter, defaultdict\n", + "import csv" + ] + }, + { + "cell_type": "markdown", + "id": "9fdcdd55-89b2-4b67-838e-d82fc5c2d957", + "metadata": {}, + "source": [ + "### 2.Setup the required inputs\n", + "* `project_id` : Your Google project id or name\n", + "* `formparser_parsed_jsons_path` : GCS storegar path where the form parser output is saved" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d717d6b4-4bdd-4d04-b9e4-da57f4dd008f", + "metadata": {}, + "outputs": [], + "source": [ + "project_id = \"xxx-xxxx-xxxx\" # your project id\n", + "formparser_parsed_jsons_path = \"gs://xxxx/xxxx/xxx/\" # path of the form parser output" + ] + }, + { + "cell_type": "markdown", + "id": "10d320b8-c2c8-46b9-8c10-33c5147347a6", + "metadata": {}, + "source": [ + "### 3.Importing Required functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92e49ea6-f84c-47e5-a562-1632f8e6e2a5", + "metadata": {}, + "outputs": [], + "source": [ + "def get_schema_file(json_dict: object) -> List[Dict[str, Any]]:\n", + " \"\"\"\n", + " Extracts schema information from Document AI output and returns a list of entities with their types and occurrences.\n", + "\n", + " Args:\n", + " json_dict (object): The OCR output in the form of a Document AI document.\n", + "\n", + " Returns:\n", + " List[Dict[str, Any]]: A list of dictionaries representing entities with their types, occurrences, mention text, and value type.\n", + " \"\"\"\n", + "\n", + " entities_kv = []\n", + " for page_number, page_data in enumerate(json_dict.pages):\n", + " for formField_number, formField_data in enumerate(\n", + " getattr(page_data, \"form_fields\", [])\n", + " ):\n", + " # Cleaning the entity name\n", + " key_name = re.sub(\n", + " r\"[^\\w\\s]\",\n", + " \"\",\n", + " formField_data.field_name.text_anchor.content.replace(\" \", \"_\")\n", + " .lower()\n", + " .strip(),\n", + " )\n", + " if key_name[-1] == \"_\":\n", + " key_name = key_name[:-1]\n", + " if key_name:\n", + " # print(formField_data.field_value.bounding_poly.normalized_vertices)\n", + " ent_xy = {\"x\": [], \"y\": []}\n", + " text_anc = []\n", + " for xy in formField_data.field_value.bounding_poly.normalized_vertices:\n", + " ent_xy[\"x\"].append(xy.x)\n", + " 
ent_xy[\"y\"].append(xy.y)\n", + " for anc in formField_data.field_value.text_anchor.text_segments:\n", + " text_anc.append(\n", + " {\"start_index\": anc.start_index, \"end_index\": anc.end_index}\n", + " )\n", + "\n", + " page_anc_1 = [\n", + " {\"x\": min(ent_xy[\"x\"]), \"y\": min(ent_xy[\"y\"])},\n", + " {\"x\": min(ent_xy[\"x\"]), \"y\": max(ent_xy[\"y\"])},\n", + " {\"x\": max(ent_xy[\"x\"]), \"y\": min(ent_xy[\"y\"])},\n", + " {\"x\": max(ent_xy[\"x\"]), \"y\": max(ent_xy[\"y\"])},\n", + " ]\n", + "\n", + " entity_new = {\n", + " \"confidence\": formField_data.field_value.confidence,\n", + " \"mention_text\": formField_data.field_value.text_anchor.content,\n", + " \"page_anchor\": {\n", + " \"page_refs\": [\n", + " {\n", + " \"bounding_poly\": {\"normalized_vertices\": page_anc_1},\n", + " \"page\": str(page_number),\n", + " }\n", + " ]\n", + " },\n", + " \"text_anchor\": {\"text_segments\": text_anc},\n", + " \"type\": key_name,\n", + " }\n", + "\n", + " entities_kv.append(entity_new)\n", + "\n", + " file_schema = []\n", + " ent_considered = []\n", + " keys_dict = {}\n", + "\n", + " for entity in entities_kv:\n", + " if entity[\"type\"] in keys_dict:\n", + " keys_dict[entity[\"type\"]] = \"OPTIONAL_MULTIPLE\"\n", + " else:\n", + " keys_dict[entity[\"type\"]] = \"OPTIONAL_ONCE\"\n", + "\n", + " for ent in entities_kv:\n", + " if ent[\"type\"] not in ent_considered:\n", + " ent_considered.append(ent[\"type\"])\n", + " temp_ent = {\n", + " \"entity_type\": ent[\"type\"],\n", + " \"occurrence\": keys_dict[ent[\"type\"]],\n", + " \"entity_mention_text\": ent[\"mention_text\"],\n", + " \"value_type\": \"string\",\n", + " }\n", + " file_schema.append(temp_ent)\n", + "\n", + " return file_schema\n", + "\n", + "\n", + "def get_consolidated_schema(data: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:\n", + " \"\"\"\n", + " Consolidates schema information from a list of entities and returns a list of dictionaries with majority occurrence\n", + " and value type for each entity type.\n", + "\n", + " Args:\n", + " data (List[List[Dict[str, Any]]]): A list of entities, where each entity is represented by a dictionary.\n", + "\n", + " Returns:\n", + " List[Dict[str, Any]]: A list of dictionaries representing consolidated schema information for each entity type.\n", + " \"\"\"\n", + "\n", + " counters = {}\n", + " for item in data:\n", + " for entity in item:\n", + " entity_type = entity[\"entity_type\"]\n", + " occurrence = entity[\"occurrence\"]\n", + " value_type = entity[\"value_type\"]\n", + "\n", + " if entity_type not in counters:\n", + " counters[entity_type] = {\n", + " \"occurrence\": Counter(),\n", + " \"value_type\": Counter(),\n", + " }\n", + "\n", + " counters[entity_type][\"occurrence\"][occurrence] += 1\n", + " counters[entity_type][\"value_type\"][value_type] += 1\n", + "\n", + " # Create a new list of dictionaries with majority occurrence and value type for each entity type\n", + " result = []\n", + " for entity_type, counts in counters.items():\n", + " majority_occurrence = counts[\"occurrence\"].most_common(1)[0][0]\n", + " majority_value_type = counts[\"value_type\"].most_common(1)[0][0]\n", + "\n", + " result.append(\n", + " {\n", + " \"entity_type\": entity_type,\n", + " \"occurrence\": majority_occurrence,\n", + " \"value_type\": majority_value_type,\n", + " }\n", + " )\n", + " df = pd.DataFrame(result)\n", + "\n", + " df.to_csv(\"document_schema.csv\")\n", + "\n", + " return result\n", + "\n", + "\n", + "def get_allfiles_csv(data: Dict[str, List[Dict[str, Any]]]) -> 
None:\n", + " \"\"\"\n", + " Groups entities by filename and writes the data to a CSV file.\n", + "\n", + " Args:\n", + " data (Dict[str, List[Dict[str, Any]]]): A dictionary where keys are filenames and values are lists of entities.\n", + "\n", + " Returns:\n", + " None\n", + " \"\"\"\n", + "\n", + " grouped_data = defaultdict(list)\n", + " for file_name, entities in data.items():\n", + " grouped_data[file_name].extend(entities)\n", + "\n", + " csv_file_path = \"Allfiles_data.csv\"\n", + " header = [\n", + " \"filename\",\n", + " \"entity_type\",\n", + " \"occurrence\",\n", + " \"entity_mention_text\",\n", + " \"value_type\",\n", + " ]\n", + "\n", + " with open(csv_file_path, \"w\", newline=\"\") as csvfile:\n", + " csv_writer = csv.DictWriter(csvfile, fieldnames=header)\n", + " csv_writer.writeheader()\n", + " csv_writer.writerows(\n", + " {\"filename\": file_name, **entity}\n", + " for file_name, entities in grouped_data.items()\n", + " for entity in entities\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "a9ed47fe-cea8-4577-9b22-526d6f7642b4", + "metadata": {}, + "source": [ + "### 4. Calling functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7349143-6b5b-48e9-9845-078d9aa365af", + "metadata": {}, + "outputs": [], + "source": [ + "# calling functions\n", + "bucket_name = formparser_parsed_jsons_path.split(\"/\")[2]\n", + "files = list(file_names(formparser_parsed_jsons_path)[1].values())\n", + "list_schema = []\n", + "file_wise = {}\n", + "for file in tqdm(files, desc=\"Status : \"):\n", + " json_dict = documentai_json_proto_downloader(bucket_name, file)\n", + " file_schema = get_schema_file(json_dict)\n", + " list_schema.append(file_schema)\n", + " file_wise[file.split(\"/\")[-1]] = file_schema\n", + "# if you need data for all files individually to review uncomment below line\n", + "# get_allfiles_csv(file_wise)\n", + "consolidated_schema = get_consolidated_schema(list_schema)" + ] + }, + { + "cell_type": "markdown", + "id": "955f1175-2fd4-490d-bd4c-1f7d5531eab5", + "metadata": {}, + "source": [ + "### 5.CSV schema output\n", + "\n", + "Form parser output in UI\n", + "\n", + "\"Form_parser_output\"\n", + "\n", + "Retrieved schema from code in the form of csv(‘document_schema.csv’)\n", + "\n", + "\"CSV_output\"" + ] + }, + { + "cell_type": "markdown", + "id": "773e10b1-347b-45b6-8d9e-127d35bf63d5", + "metadata": {}, + "source": [ + "### The above schema can be reviewed or modified as per the user requirements." 
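+
+A minimal sketch of such a review step, assuming the column names written by `get_consolidated_schema` (`entity_type`, `occurrence`, `value_type`); the entity name below is purely illustrative, and occurrence values follow Document AI's occurrence types (e.g. OPTIONAL_ONCE, OPTIONAL_MULTIPLE, REQUIRED_ONCE):
+
+```python
+import pandas as pd
+
+df = pd.read_csv("document_schema.csv")
+# Example tweak: mark a hypothetical field as required before pushing the schema
+df.loc[df["entity_type"] == "invoice_date", "occurrence"] = "REQUIRED_ONCE"
+df.to_csv("document_schema.csv", index=False)
+```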
+ ] + }, + { + "cell_type": "markdown", + "id": "7911bae8-dbd5-4290-9daf-1e61201c0b70", + "metadata": { + "tags": [] + }, + "source": [ + "## Updating Schema to another parser" + ] + }, + { + "cell_type": "markdown", + "id": "96f0af86-3b3f-41e9-b923-226ec818e64e", + "metadata": { + "tags": [] + }, + "source": [ + "### 1.Setup the required inputs\n", + "* `project_id` : Your Google project id or name\n", + "* `location_processor` : Location of processor\n", + "* `processor_id` : to which schema has to be updated\n", + "* `updated_schema_csv_path` : csv file modified or reviewed from above step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ccaba42-fdcc-4b74-8e1c-619fcdd765eb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "project_number = \"XXXXXXXXXXXXXXXX\" # project number\n", + "location_processor = \"us\" # location of processor\n", + "processor_id = \"xxxxxxxxxxxxxxxx\" # to which schema has to be updated\n", + "updated_schema_csv_path = (\n", + " \"document_schema.csv\" # csv file modified or reviewed from above step\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "bf26a3a0-3c9a-49d6-85a7-275664f65424", + "metadata": {}, + "source": [ + "### Required functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcc9a0c7-af40-4fd8-8038-acdce986be80", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# helper functions\n", + "# get document schema\n", + "def get_dataset_schema(processor_name: str) -> Any:\n", + " \"\"\"\n", + " Retrieves the dataset schema for a specified processor.\n", + "\n", + " Args:\n", + " processor_name (str): The name of the processor.\n", + "\n", + " Returns:\n", + " Any: The response containing the dataset schema information.\n", + " \"\"\"\n", + "\n", + " # Create a client\n", + " from google.cloud import documentai_v1beta3\n", + "\n", + " client = documentai_v1beta3.DocumentServiceClient()\n", + "\n", + " # dataset_name = client.dataset_schema_path(project, location, processor)\n", + " # Initialize request argument(s)\n", + " request = documentai_v1beta3.GetDatasetSchemaRequest(\n", + " name=processor_name + \"/dataset/datasetSchema\",\n", + " )\n", + "\n", + " # Make the request\n", + " response = client.get_dataset_schema(request=request)\n", + "\n", + " return response\n", + "\n", + "\n", + "# update schema\n", + "def update_dataset_schema(schema: document.Document):\n", + " \"\"\"\n", + " Updates the dataset schema.\n", + "\n", + " Args:\n", + " schema (document.Document): The document containing the updated dataset schema.\n", + "\n", + " Returns:\n", + " document.Document: The response containing the updated dataset schema information.\n", + " \"\"\"\n", + "\n", + " from google.cloud import documentai_v1beta3\n", + "\n", + " # Create a client\n", + " client = documentai_v1beta3.DocumentServiceClient()\n", + "\n", + " # Initialize request argument(s)\n", + " request = documentai_v1beta3.UpdateDatasetSchemaRequest(\n", + " dataset_schema={\"name\": schema.name, \"document_schema\": schema.document_schema}\n", + " )\n", + " # Make the request\n", + " response = client.update_dataset_schema(request=request)\n", + "\n", + " # Handle the response\n", + " return response" + ] + }, + { + "cell_type": "markdown", + "id": "03ea2a33-7731-45c4-b796-925e441ebda4", + "metadata": {}, + "source": [ + "### Calling functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "616e70c6-ec64-426f-945a-dd1d0dbe3567", + "metadata": {}, + "outputs": [], + 
"source": [ + "# updating schema of processor\n", + "import pandas as pd\n", + "\n", + "df_updated = pd.read_csv(updated_schema_csv_path)\n", + "schema_updated = []\n", + "for m in range(len(df_updated)):\n", + " schema_ent = {\n", + " \"name\": df_updated.loc[m][\"entity_type\"],\n", + " \"value_type\": df_updated.loc[m][\"value_type\"],\n", + " \"occurrence_type\": df_updated.loc[m][\"occurrence\"],\n", + " }\n", + " schema_updated.append(schema_ent)\n", + "\n", + "response_schema = get_dataset_schema(\n", + " f\"projects/{project_number}/locations/{location_processor}/processors/{processor_id}\"\n", + ")\n", + "\n", + "for i in response_schema.document_schema.entity_types:\n", + " for e3 in schema_updated:\n", + " i.properties.append(e3)\n", + "\n", + "response = update_dataset_schema(response_schema)" + ] + }, + { + "cell_type": "markdown", + "id": "b571acbf-420d-4a05-bd7e-a0a4b3de211b", + "metadata": {}, + "source": [ + "### output\n", + "The above script adds the schema in the parser as below\n", + "\n", + "\"processor_output\"" + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/documentai_migrating_schema_between_processors/Document AI Migrating Schema Between Processors.ipynb b/incubator-tools/documentai_migrating_schema_between_processors/Document AI Migrating Schema Between Processors.ipynb new file mode 100644 index 000000000..def1d6a91 --- /dev/null +++ b/incubator-tools/documentai_migrating_schema_between_processors/Document AI Migrating Schema Between Processors.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e4927d28-e2ca-44dc-9ea1-8585d4778478", + "metadata": {}, + "source": [ + "# Document AI Migrating Schema Between Processors" + ] + }, + { + "cell_type": "markdown", + "id": "a1f434f4-ec7e-4f23-a0a5-da02094a754f", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "2a5283be-a5b8-44c0-9f2c-a30a2016092e", + "metadata": {}, + "source": [ + "## Objective" + ] + }, + { + "cell_type": "markdown", + "id": "21b6b827-3ae9-4292-bfe5-57711a5d31d6", + "metadata": {}, + "source": [ + "The code snippet utilizes the Google Cloud Document AI library to migrate a Document AI Dataset schema from one processor to another. It allows for the migration of schemas within the same Google Cloud project and also between distinct projects." 
+ ] + }, + { + "cell_type": "markdown", + "id": "18e06ac8-d96b-4b09-b2c6-809b38c3bcda", + "metadata": {}, + "source": [ + "## Pre-requisites" + ] + }, + { + "cell_type": "markdown", + "id": "3da16c4b-dd79-4fe5-8800-ad6b921c20a6", + "metadata": {}, + "source": [ + "* Vertex AI Notebook\n", + "* Access to Projects and Document AI Processors\n" + ] + }, + { + "cell_type": "markdown", + "id": "67cf9183-3183-4062-8956-d7a6d320dd8c", + "metadata": {}, + "source": [ + "## Step by Step procedure " + ] + }, + { + "cell_type": "markdown", + "id": "c363c466-fbcd-437b-83a3-ba93f59462e4", + "metadata": {}, + "source": [ + "### 1. Import the Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33d58fb1-41fb-4ec8-8f67-7cb45cda613a", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import documentai_v1beta3" + ] + }, + { + "cell_type": "markdown", + "id": "577eadfe-75cb-445a-bb8e-b933f9994a07", + "metadata": {}, + "source": [ + "### 2. Configure the Inputs" + ] + }, + { + "cell_type": "markdown", + "id": "02b5acb7-eacf-4349-8751-f6449f308b00", + "metadata": {}, + "source": [ + "* **source_processor_id** : This is the Source Processor ID present in source processor details.\n", + "* **destination_processor_id** : This is the Destination Processor ID present in destination processor details.\n", + "* **source_project_id** : This is the project id of the source project.\n", + "* **destination_project_id** : This is the project id of the destination project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73301281-6469-4e48-8108-8f82004c0269", + "metadata": {}, + "outputs": [], + "source": [ + "source_processor_id = \"XXX-XXX-XXX\" # Source Processor ID\n", + "destination_processor_id = \"YYY-YYY-YYY\" # Destination Processor ID\n", + "source_project_id = \"ZZZ-ZZZ-ZZZ\" # Source Project ID\n", + "destination_project_id = \"ZZZ-ZZZ-ZZZ\" # Destination Project ID" + ] + }, + { + "cell_type": "markdown", + "id": "abe99438-8fba-4524-8078-5115c15d6687", + "metadata": {}, + "source": [ + "### 3. Execute the code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59121513-f87c-4bc3-9c98-34c908f799e5", + "metadata": {}, + "outputs": [], + "source": [ + "client = documentai_v1beta3.DocumentServiceClient()\n", + "request = documentai_v1beta3.GetDatasetSchemaRequest(\n", + " name=f\"projects/{source_project_id}/locations/us/processors/{source_processor_id}/dataset/datasetSchema\",\n", + " visible_fields_only=True,\n", + ")\n", + "\n", + "old_schema = client.get_dataset_schema(request=request)\n", + "\n", + "# print(old_schema) # Print the Old Schema\n", + "\n", + "old_schema.name = f\"projects/{destination_project_id}/locations/us/processors/{destination_processor_id}/dataset/datasetSchema\" # Destination Processor\n", + "\n", + "request = documentai_v1beta3.UpdateDatasetSchemaRequest(dataset_schema=old_schema)\n", + "\n", + "# Make the request\n", + "response = client.update_dataset_schema(request=request)\n", + "\n", + "print(\"Schema Updated\")" + ] + }, + { + "cell_type": "markdown", + "id": "b080c655-5408-45c5-8830-1cd240f713fc", + "metadata": {}, + "source": [ + "With the provided code, users can effortlessly duplicate dataset schemas across processors." 
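+
+As an optional sanity check — a sketch only, assuming the cell above ran successfully so that `old_schema` still holds the source processor's schema — the entity types about to overwrite the destination can be printed first:
+
+```python
+for entity_type in old_schema.document_schema.entity_types:
+    print(entity_type.name, [prop.name for prop in entity_type.properties])
+```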
+ ] + }, + { + "cell_type": "markdown", + "id": "b1786caf-b550-406c-ae80-7d73ccd94d3a", + "metadata": {}, + "source": [ + "**Note**:\n", + "* The **visible_fields_only=True** parameter in the GetDatasetSchemaRequest ensures that only the enabled fields from the source schema are transferred. If set to False, all fields from the source schema, regardless of their visibility status, will be transferred to the destination schema.\n", + "\n", + "* When transferring the schema, all existing schema in the destination processor will be overwritten. This means any pre-existing schema in the destination processor will be replaced with the schema from the source processor.\n" + ] + }, + { + "cell_type": "markdown", + "id": "b7866c5f-c98c-4f11-9801-37bac34f83af", + "metadata": {}, + "source": [ + "### 4. Output" + ] + }, + { + "cell_type": "markdown", + "id": "b7e37c01-e0a0-46d9-8f73-bc3660753893", + "metadata": {}, + "source": [ + "* Source Project Processor Schema :\n", + "\n", + "\"Project_A_Source_Schema\"\n", + "\n", + "* Destination Project Processor Schema : \n", + "\n", + "\"Project_B_Destination\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6b4f053-9e39-46aa-bee7-edb705ee74bf", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/documentai_migrating_schema_between_processors/Images/Project_A_Source_Schema.png b/incubator-tools/documentai_migrating_schema_between_processors/Images/Project_A_Source_Schema.png new file mode 100644 index 000000000..20df2d539 Binary files /dev/null and b/incubator-tools/documentai_migrating_schema_between_processors/Images/Project_A_Source_Schema.png differ diff --git a/incubator-tools/documentai_migrating_schema_between_processors/Images/Project_B_Destination.png b/incubator-tools/documentai_migrating_schema_between_processors/Images/Project_B_Destination.png new file mode 100644 index 000000000..abb75b862 Binary files /dev/null and b/incubator-tools/documentai_migrating_schema_between_processors/Images/Project_B_Destination.png differ diff --git a/incubator-tools/documentai_migrating_schema_between_processors/README.md b/incubator-tools/documentai_migrating_schema_between_processors/README.md new file mode 100644 index 000000000..a11c7c0d5 --- /dev/null +++ b/incubator-tools/documentai_migrating_schema_between_processors/README.md @@ -0,0 +1,23 @@ +# DocumentAI Migration Schema Between Processors + +## Purpose and Description + +The code snippet utilizes the Google Cloud Document AI library to migrate a Document AI Dataset schema from one processor to another. +It allows for the migration of schemas within the same Google Cloud project and also between distinct projects. + +## Input Details + +* **source_processor_id** : This is the Source Processor ID present in source processor details. +* **destination_processor_id** : This is the Destination Processor ID present in destination processor details. 
+* **source_project_id** : This is the project id of the source project. +* **destination_project_id** : This is the project id of the destination project. + +## Output Details + +* Source Project Processor Schema : + +Project_A_Source_Schema + +* Destination Project Processor Schema : + +Project_B_Destination diff --git a/incubator-tools/formparser_table_to_entity_converter_tool/README.md b/incubator-tools/formparser_table_to_entity_converter_tool/README.md new file mode 100644 index 000000000..0703efadb --- /dev/null +++ b/incubator-tools/formparser_table_to_entity_converter_tool/README.md @@ -0,0 +1,39 @@ +# Purpose and Description + +This document provides a step-by-step guide on how to use the Formparser Table to Entity Converter Tool. The tool converts Formparser tables output to entity-annotated JSON files. The user inputs a dictionary of header names and their corresponding entity names, and the tool uses fuzzy matching to map the headers to the entities. The output JSON files can be used to train and visualize entities + +## Input Details + +user_input = { + "ItemCode": "item code", + "Quantity": "QTY CASE", + "TotalPrice": "Unit Price", + "UnitPrice": "Amount" +} +* **input_bucket_name** and **output_bucket_name** : variables indicate the Google Cloud Storage bucket name. +* **input_prefix** : It denotes the directory path within the GCS bucket where input JSON files reside. +* **output_prefix** : It marks the directory path within the GCS bucket where processed or output JSON files will be stored. + +## Output Details + +### Input Form Parser Output Json + +None + + +### Output Table to line item entity converted Json + +The output JSON will contain data extracted from Form parser tables present in the source document, +and this data will be structured as line items. +The extraction and structuring process will be guided by the specifications provided in the user_input dictionary. +The user_input dictionary serves as a blueprint: it maps specific headers (as they appear in the source document) +to corresponding entity names (as they should be represented in the output JSON). +By following these mappings, the script can convert table structure into line items in the resulting JSON. + +output.png + +**Note:** +* Code works for tables with over 7 columns and multiple rows or tables resembling the example shown above. +* When converting tables to line items, the table header becomes part of the line items and gets included in the processed JSON. +* Discrepancies may occur during conversion due to reliance on the form parser table output, resulting in potential merging of columns or rows. 
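+
+For reference, a minimal sketch of the header-to-entity fuzzy matching idea described above (the detected header string and the score threshold here are illustrative, not the tool's exact values):
+
+```python
+from fuzzywuzzy import process
+
+user_input = {
+    "ItemCode": "item code",
+    "Quantity": "QTY CASE",
+    "TotalPrice": "Unit Price",
+    "UnitPrice": "Amount",
+}
+
+detected_header = "Item Code #"  # e.g. a column header read from a Form Parser table
+best_match, score = process.extractOne(detected_header, list(user_input.values()))
+if score >= 80:  # illustrative threshold
+    entity_name = next(k for k, v in user_input.items() if v == best_match)
+    print(f"{detected_header!r} -> {entity_name} (score {score})")
+```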
+ diff --git a/incubator-tools/formparser_table_to_entity_converter_tool/formparser_table_to_entity_converter_tool.ipynb b/incubator-tools/formparser_table_to_entity_converter_tool/formparser_table_to_entity_converter_tool.ipynb new file mode 100644 index 000000000..f46fc60a7 --- /dev/null +++ b/incubator-tools/formparser_table_to_entity_converter_tool/formparser_table_to_entity_converter_tool.ipynb @@ -0,0 +1,590 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a19abb34-c0de-4a80-b0dd-3764acf98e24", + "metadata": {}, + "source": [ + "# Formparser Table to Entity Converter Tool" + ] + }, + { + "cell_type": "markdown", + "id": "1040c3c0-e238-4bbd-a250-bed84a4bcc28", + "metadata": { + "tags": [] + }, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "7e96a4db-948c-44ad-b262-f00482c4cfef", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "122324e9-d807-42a9-b847-edaa93f222a3", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This document provides a step-by-step guide on how to use the Formparser Table to Entity Converter Tool. The tool converts Formparser tables output to entity-annotated JSON files. The user inputs a dictionary of header names and their corresponding entity names, and the tool uses fuzzy matching to map the headers to the entities. The output JSON files can be used to train and visualize entities" + ] + }, + { + "cell_type": "markdown", + "id": "f4a04eb8-65db-4610-adf9-24c4bc8219aa", + "metadata": {}, + "source": [ + "## Prerequisites \n", + "* Knowledge of Python\n", + "* Python : Jupyter notebook (Vertex) or Google Colab \n", + "* Access to Json Files in the Google Bucket" + ] + }, + { + "cell_type": "markdown", + "id": "1237a68b-985e-47c4-b0f2-e074d9cdf1eb", + "metadata": {}, + "source": [ + "## Step by step procedure" + ] + }, + { + "cell_type": "markdown", + "id": "9db087b3-4698-4583-942a-0fa113e6b71e", + "metadata": {}, + "source": [ + "### Download and install the required libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b311d461-e932-4ab0-a52e-a54f2ee3acc6", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install fuzzywuzzy pandas google-cloud-storage google-cloud-documentai\n", + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "markdown", + "id": "f06a87e6-2a3d-446f-99ec-168e1e64ddb8", + "metadata": {}, + "source": [ + "### Import the required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0e53276d-7b32-45ba-b352-132e7403e929", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "from typing import List, Dict, Any\n", + "import pandas as pd\n", + "from fuzzywuzzy import process\n", + "from google.cloud import storage\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "import utilities\n", + "from typing import Dict, List, Optional, Tuple, TypedDict, Any, Union" + ] + }, + { + "cell_type": "markdown", + "id": "52eae302-0066-4822-b809-e174944f2d8a", + "metadata": {}, + "source": [ + "### Setup the required inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": 
"2e135c6c-5b8c-4baf-a2f4-270761fb9e9c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "user_input = {\n", + " \"ItemCode\": \"item code\",\n", + " \"Quantity\": \"QTY CASE\",\n", + " \"TotalPrice\": \"Unit Price\",\n", + " \"UnitPrice\": \"Amount\",\n", + "}\n", + "# Specify your bucket and prefix (folder)\n", + "input_bucket_name = \"xxxxxxxx\"\n", + "input_prefix = \"xxxxxxxx/xxxxxx/xxxxxx/\"\n", + "\n", + "output_bucket_name = \"xxxxxxxx\"\n", + "output_prefix = \"xxxxxx/xxxxxxxx/\"" + ] + }, + { + "cell_type": "markdown", + "id": "152efded-8cdc-44bc-92fe-3432d83c4dd4", + "metadata": {}, + "source": [ + "When setting up or modifying the **`user_input`** dictionary, ensure to:\n", + "\n", + "Use the appropriate entity name (from your schema) as the key.\n", + "Match it with the correct header name (from the PDF) as its value.\n", + "\n", + "For Example, in the **`user_input dictionary`**:\n", + "**`\"ItemCode\"`** is an entity name used in a schema.\n", + "**`\"item code\"`** is the header name that you would look for in a PDF.\n", + "\n", + "**Note:** If you wish to modify the Parent Entity Name, simply replace **`\"invoiceItem\"`** in the code with the desired name based on your requirements.\n", + "\n", + "**`input_bucket_name`** and **`output_bucket_name`** variables indicate the Google Cloud Storage bucket name. \\\n", + "**`input_prefix`** denotes the directory path within the GCS bucket where input JSON files reside. \\\n", + "**`output_prefix`** marks the directory path within the GCS bucket where processed or output JSON files will be stored.\n" + ] + }, + { + "cell_type": "markdown", + "id": "af731067-0776-4578-8e6a-05c358f3cccf", + "metadata": {}, + "source": [ + "### Run the required Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f392b552-7cdf-44bd-8ae1-edf7e0789af1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def text_anchor_to_text(\n", + " document, text_anchor, page_number\n", + ") -> Dict[str, Optional[Union[str, Any]]]:\n", + " \"\"\"\n", + " Extracts text and corresponding bounding box information from a document based on text anchors.\n", + "\n", + " Args:\n", + " document (Document): A dictionary representing the document, containing the full text.\n", + " text_anchor (TextAnchor): A dictionary representing the text anchor, containing text segments.\n", + " page_number (int): The page number where the text anchor is located.\n", + "\n", + " Returns:\n", + " Dict[str, Optional[Union[str, BoundingBox]]]: A dictionary containing the extracted text and the bounding box.\n", + " The bounding box is represented as a dictionary with 'topLeft' and 'bottomRight' keys, each containing\n", + " a dictionary with 'x' and 'y' coordinates. 
If no bounding box is found, the value is None.\n", + " \"\"\"\n", + " response = \"\"\n", + " text_segments = text_anchor.text_segments if text_anchor else []\n", + "\n", + " all_bounding_boxes = []\n", + " for segment in text_segments:\n", + " start_index = segment.start_index if segment.start_index else 0\n", + " end_index = segment.end_index if segment.end_index else len(document.text)\n", + "\n", + " bounding_box, _, _ = get_token(\n", + " document,\n", + " page_number,\n", + " [{\"start_index\": str(start_index), \"end_index\": str(end_index)}],\n", + " )\n", + " vertices = {\n", + " \"topLeft\": {\"x\": bounding_box[\"min_x\"], \"y\": bounding_box[\"min_y\"]},\n", + " \"bottomRight\": {\"x\": bounding_box[\"max_x\"], \"y\": bounding_box[\"max_y\"]},\n", + " }\n", + "\n", + " if vertices:\n", + " response += document.text[start_index:end_index]\n", + " all_bounding_boxes.append(vertices)\n", + "\n", + " if all(box is None for box in all_bounding_boxes):\n", + " return {\"text\": response.strip().replace(\"\\n\", \" \"), \"bounding_box\": None}\n", + "\n", + " # Get the min and max values, or use defaults if the lists are empty\n", + " min_x_list = [\n", + " box[\"topLeft\"][\"x\"]\n", + " for box in all_bounding_boxes\n", + " if box[\"topLeft\"][\"x\"] is not None\n", + " ]\n", + " min_y_list = [\n", + " box[\"topLeft\"][\"y\"]\n", + " for box in all_bounding_boxes\n", + " if box[\"topLeft\"][\"y\"] is not None\n", + " ]\n", + " max_x_list = [\n", + " box[\"bottomRight\"][\"x\"]\n", + " for box in all_bounding_boxes\n", + " if box[\"bottomRight\"][\"x\"] is not None\n", + " ]\n", + " max_y_list = [\n", + " box[\"bottomRight\"][\"y\"]\n", + " for box in all_bounding_boxes\n", + " if box[\"bottomRight\"][\"y\"] is not None\n", + " ]\n", + "\n", + " if not (min_x_list and min_y_list and max_x_list and max_y_list):\n", + " return {\"text\": response.strip().replace(\"\\n\", \" \"), \"bounding_box\": None}\n", + " min_x = min(min_x_list)\n", + " min_y = min(min_y_list)\n", + " max_x = max(max_x_list)\n", + " max_y = max(max_y_list)\n", + "\n", + " page_anchor = {\n", + " \"topLeft\": {\"x\": min_x, \"y\": min_y},\n", + " \"bottomRight\": {\"x\": max_x, \"y\": max_y},\n", + " }\n", + "\n", + " return {\n", + " \"text\": response.strip().replace(\"\\n\", \" \"),\n", + " \"page_anchor\": page_anchor,\n", + " \"text_anchor\": text_anchor,\n", + " }\n", + "\n", + "\n", + "def get_token(document, page_num, text_anchors_check) -> Tuple[Any, List[Dict], float]:\n", + " \"\"\"\n", + " Extracts the bounding box, text anchor tokens, and confidence level for specified text anchors in a document.\n", + "\n", + " Args:\n", + " document (Document): A dictionary representing the document, containing pages with tokens.\n", + " page_num (int): The page number to search for tokens.\n", + " text_anchors_check (List[TextAnchorCheck]): A list of dictionaries containing 'start_index' and 'end_index'\n", + " for text anchors to be checked.\n", + "\n", + " Returns:\n", + " Tuple[BoundingBox, List[Dict], float]: A tuple containing the bounding box of the text (if found),\n", + " a list of text anchor tokens, and the highest confidence level among the tokens.\n", + " The bounding box is a dictionary with 'min_x', 'min_y', 'max_x', 'max_y'. 
If no bounding box is found, values are None.\n", + " \"\"\"\n", + " min_x = min_y = max_x = max_y = None\n", + " text_anc_token = []\n", + " confidence = 0.0\n", + "\n", + " for page in document.pages:\n", + " if page.page_number - 1 == page_num:\n", + " for token in page.tokens:\n", + " vertices = token.layout.bounding_poly.normalized_vertices\n", + " min_x_token = min(vertex.x for vertex in vertices)\n", + " min_y_token = min(vertex.y for vertex in vertices)\n", + " max_x_token = max(vertex.x for vertex in vertices)\n", + " max_y_token = max(vertex.y for vertex in vertices)\n", + "\n", + " start_index = token.layout.text_anchor.text_segments[0].start_index\n", + " end_index = token.layout.text_anchor.text_segments[0].end_index\n", + "\n", + " # Adjusting the logic to match the text anchors\n", + " for text_anchor_check in text_anchors_check:\n", + " start_index_check = int(text_anchor_check[\"start_index\"])\n", + " end_index_check = int(text_anchor_check[\"end_index\"])\n", + "\n", + " if (\n", + " start_index <= start_index_check\n", + " and end_index >= end_index_check\n", + " ):\n", + " min_x = (\n", + " min_x_token if min_x is None else min(min_x, min_x_token)\n", + " )\n", + " min_y = (\n", + " min_y_token if min_y is None else min(min_y, min_y_token)\n", + " )\n", + " max_x = (\n", + " max_x_token if max_x is None else max(max_x, max_x_token)\n", + " )\n", + " max_y = (\n", + " max_y_token if max_y is None else max(max_y, max_y_token)\n", + " )\n", + " text_anc_token.append(token.layout.text_anchor.text_segments)\n", + " confidence = max(confidence, token.layout.confidence)\n", + "\n", + " return (\n", + " {\"min_x\": min_x, \"min_y\": min_y, \"max_x\": max_x, \"max_y\": max_y},\n", + " text_anc_token,\n", + " confidence,\n", + " )\n", + "\n", + "\n", + "def get_table_data(document, rows, page_number) -> List[List[Dict[str, Any]]]:\n", + " \"\"\"\n", + " Extracts text data and bounding boxes from table rows in a Document AI object.\n", + "\n", + " Args:\n", + " document (Dict[str, Any]): The Document AI object, representing the processed document.\n", + " rows (List[Dict[str, Any]]): A list of row objects extracted from a table in the document.\n", + " page_number (int): The page number where the table is located.\n", + "\n", + " Returns:\n", + " List[List[Dict[str, Any]]]: A nested list where each sublist represents a row in the table.\n", + " Each element in the sublist is a dictionary containing the text data and its corresponding bounding box\n", + " for a cell in the row.\n", + " \"\"\"\n", + " all_values = []\n", + " for row in rows:\n", + " current_row_values = []\n", + " for cell in row.cells:\n", + " cell_data = text_anchor_to_text(\n", + " document, cell.layout.text_anchor, page_number\n", + " )\n", + " current_row_values.append(cell_data)\n", + " all_values.append(current_row_values)\n", + " return all_values\n", + "\n", + "\n", + "def read_json_from_gcs(bucket_name: str, blob_name: str) -> Dict:\n", + " \"\"\"\n", + " Reads a JSON file from Google Cloud Storage (GCS) and returns its contents.\n", + "\n", + " Args:\n", + " bucket_name (str): The name of the GCS bucket.\n", + " blob_name (str): The name of the blob (file) in the GCS bucket.\n", + "\n", + " Returns:\n", + " Dict: The contents of the JSON file as a dictionary.\n", + " \"\"\"\n", + " bucket = client.get_bucket(bucket_name)\n", + " blob = bucket.blob(blob_name)\n", + " json_data = json.loads(blob.download_as_text())\n", + " return json_data" + ] + }, + { + "cell_type": "markdown", + "id": 
"3c56978c-33d7-4173-8ebf-c8dcc57ebce7", + "metadata": {}, + "source": [ + "### Execute the code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a38e9801-765b-4791-a22b-559c4a29f105", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Set up the Google Cloud Storage client\n", + "client = storage.Client()\n", + "\n", + "# List all .json files in the input GCS bucket with the given prefix\n", + "blobs = client.list_blobs(input_bucket_name, prefix=input_prefix)\n", + "json_files = [blob.name for blob in blobs if blob.name.endswith(\".json\")]\n", + "\n", + "\n", + "for json_file in json_files:\n", + " print(f\"Processing: {json_file}\")\n", + " json_data = read_json_from_gcs(input_bucket_name, json_file)\n", + " if \"entities\" not in json_data:\n", + " json_data[\"entities\"] = []\n", + " result = []\n", + " json_string = json.dumps(json_data)\n", + " document = documentai.Document.from_json(json_string)\n", + " # print(document.entities)\n", + " for page_index, page in enumerate(document.pages):\n", + " page_number = page_index\n", + " for table in page.tables:\n", + " # Convert RepeatedComposite to Python lists and concatenate\n", + " all_rows = list(table.header_rows) + list(table.body_rows)\n", + "\n", + " # Extract cell values from rows\n", + " table_data = []\n", + " for row in all_rows:\n", + " row_data = get_table_data(document, [row], page_number)\n", + " table_data.append(row_data[0])\n", + "\n", + " df = pd.DataFrame(data=table_data)\n", + " df.index = df.index + 1\n", + " df = df.sort_index()\n", + "\n", + " # display(df)\n", + "\n", + " if df.shape[1] > 7:\n", + " first_row = df.iloc[0]\n", + " actual_headers = [elem[\"text\"] for elem in first_row]\n", + "\n", + " # Update the dataframe's columns to the actual headers\n", + " df.columns = actual_headers\n", + "\n", + " # Mapping the user input columns to actual headers\n", + " matched_headers = {}\n", + " for friendly_name, input_header in user_input.items():\n", + " best_match, score = process.extractOne(input_header, actual_headers)\n", + " if score >= 70: # Adjust the threshold if needed\n", + " matched_headers[friendly_name] = best_match\n", + " else:\n", + " print(f\"No match found for '{input_header}'\")\n", + "\n", + " # Filter the dataframe for matched columns\n", + " df = df[\n", + " [\n", + " matched_headers[friendly_name]\n", + " for friendly_name in matched_headers\n", + " ]\n", + " ]\n", + "\n", + " for _, row in df.iterrows():\n", + " row_data = {\"properties\": [], \"type\": \"\", \"mention_text\": \"\"}\n", + " combined_mention_text = \"\"\n", + " parent_type = \"invoiceItem\"\n", + "\n", + " for friendly_name, matched_header in matched_headers.items():\n", + " cell = row[matched_header]\n", + " mention_text = cell.get(\"text\")\n", + " text_anchor = cell.get(\"text_anchor\")\n", + " page_anchor = cell.get(\"page_anchor\")\n", + "\n", + " if (\n", + " mention_text is None\n", + " or text_anchor is None\n", + " or page_anchor is None\n", + " ):\n", + " continue\n", + "\n", + " # Handling TextAnchor with multiple text segments\n", + " text_segments = []\n", + " for segment in text_anchor.text_segments:\n", + " text_segments.append(\n", + " {\n", + " \"start_index\": segment.start_index,\n", + " \"end_index\": segment.end_index,\n", + " }\n", + " )\n", + "\n", + " # Nesting text_segments under text_anchor\n", + " text_anchor_dict = {\"text_segments\": text_segments}\n", + "\n", + " child_entity_type = friendly_name\n", + " child_entity = {\n", + " \"type\": 
child_entity_type,\n", + " \"mention_text\": mention_text,\n", + " \"text_anchor\": text_anchor_dict, # Using the nested text_anchor structure\n", + " }\n", + "\n", + " vertices = [\n", + " {\n", + " \"x\": page_anchor[\"topLeft\"][\"x\"],\n", + " \"y\": page_anchor[\"topLeft\"][\"y\"],\n", + " },\n", + " {\n", + " \"x\": page_anchor[\"bottomRight\"][\"x\"],\n", + " \"y\": page_anchor[\"topLeft\"][\"y\"],\n", + " },\n", + " {\n", + " \"x\": page_anchor[\"bottomRight\"][\"x\"],\n", + " \"y\": page_anchor[\"bottomRight\"][\"y\"],\n", + " },\n", + " {\n", + " \"x\": page_anchor[\"topLeft\"][\"x\"],\n", + " \"y\": page_anchor[\"bottomRight\"][\"y\"],\n", + " },\n", + " ]\n", + " child_entity[\"page_anchor\"] = {\n", + " \"page_refs\": [\n", + " {\n", + " \"bounding_poly\": {\"normalized_vertices\": vertices},\n", + " \"page\": str(page_number),\n", + " }\n", + " ]\n", + " }\n", + " combined_mention_text += mention_text + \" \"\n", + " row_data[\"properties\"].append(child_entity)\n", + "\n", + " row_data[\"type\"] = parent_type\n", + " row_data[\"mention_text\"] = combined_mention_text\n", + " result.append(row_data)\n", + "\n", + " # display(df)\n", + "\n", + " # print(result)\n", + " output_blob_name = output_prefix + json_file.split(\"/\")[-1]\n", + "\n", + " # Convert the JSON string back to a dictionary\n", + " json_data = json.loads(documentai.Document.to_json(document))\n", + "\n", + " # Append the serializable result to the 'entities' field\n", + " json_data[\"entities\"].extend(result)\n", + "\n", + " # Convert the modified JSON data back to a string\n", + " json_string = json.dumps(json_data, indent=4)\n", + "\n", + " # Call the store_document_as_json function\n", + " utilities.store_document_as_json(json_string, output_bucket_name, output_blob_name)\n", + " # break\n", + "\n", + "print(\"DONE\")" + ] + }, + { + "cell_type": "markdown", + "id": "50fb1b3b-62f3-46ba-b126-7253303c7066", + "metadata": {}, + "source": [ + "## **Output** \n", + "\n", + "### Input Form Parser Output Json \n", + "\n", + "\"None\"\n", + "\n", + "\n", + "### Output Table to line item entity converted Json\n", + "\n", + "The output JSON will contain data extracted from Form parser tables present in the source document, and this data will be structured as line items. The extraction and structuring process will be guided by the specifications provided in the user_input dictionary. The user_input dictionary serves as a blueprint: it maps specific headers (as they appear in the source document) to corresponding entity names (as they should be represented in the output JSON). By following these mappings, the script can convert table structure into line items in the resulting JSON.\n", + "\n", + "\"None\"\n", + "\n", + "**Note:**\n", + "* Code works for tables with over 7 columns and multiple rows or tables resembling the example shown above.\n", + "* When converting tables to line items, the table header becomes part of the line items and gets included in the processed JSON.\n", + "* Discrepancies may occur during conversion due to reliance on the form parser table output, resulting in potential merging of columns or rows." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7834cdc-ae5f-40ef-8782-1de5834a084d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "conda-root-py", + "name": "workbench-notebooks.m113", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/formparser_table_to_entity_converter_tool/images/input.png b/incubator-tools/formparser_table_to_entity_converter_tool/images/input.png new file mode 100644 index 000000000..93a10a2d5 Binary files /dev/null and b/incubator-tools/formparser_table_to_entity_converter_tool/images/input.png differ diff --git a/incubator-tools/formparser_table_to_entity_converter_tool/images/output.png b/incubator-tools/formparser_table_to_entity_converter_tool/images/output.png new file mode 100644 index 000000000..4b89a4853 Binary files /dev/null and b/incubator-tools/formparser_table_to_entity_converter_tool/images/output.png differ diff --git a/incubator-tools/ocr_based_document_section_splitter/README.md b/incubator-tools/ocr_based_document_section_splitter/README.md new file mode 100644 index 000000000..d84d655c3 --- /dev/null +++ b/incubator-tools/ocr_based_document_section_splitter/README.md @@ -0,0 +1,39 @@ +# Purpose and Description + +This tool is designed to segment PDF documents into distinct sections based on the header coordinates obtained from the Document OCR processor. +It then saves the segmented outputs as individual images, each named after the corresponding section. +Additionally, the tool offers the option to specify which sections need to be split, allowing for selective processing. + +## Input Details + +* **bucket_name**: This variable should contain the name of the Google Cloud Storage bucket. + +* **input_folder**: This variable should contain the path to the input folder which contains the Document OCR Output Json of the PDF files which need to be processed. + +* **output_folder**: This variable should contain the path to the output folder where all the splitted images will be stored. + +* **search_strings_parts**: This dictionary is designed with unique strings that act as identifiers. In the provided example, each string represents the title of a section on the page. These unique titles serve as delimiters, enabling the straightforward identification and separation of different sections. + +* **selected_parts**: This is a list of selected parts to be selected. Specify the part names within the list. + +To select all parts, you can uncomment the line selected_parts = None and comment out the previous line with the list of parts. + + +## Output Details + +The PDF will be divided according to your specified input, and each section will be stored as a separate image in the output directory, following the naming pattern _part_*.jpeg. 
+ +## **Input PDF** + +input_pdf.png + +## **Output Splitted Images** + +## **Part 3** +part_3.png + +## **Part 4** +part_4.png + +## **Part 5** +part_5.png diff --git a/incubator-tools/ocr_based_document_section_splitter/images/input_pdf.png b/incubator-tools/ocr_based_document_section_splitter/images/input_pdf.png new file mode 100644 index 000000000..427583d52 Binary files /dev/null and b/incubator-tools/ocr_based_document_section_splitter/images/input_pdf.png differ diff --git a/incubator-tools/ocr_based_document_section_splitter/images/part_3.png b/incubator-tools/ocr_based_document_section_splitter/images/part_3.png new file mode 100644 index 000000000..89bdde27a Binary files /dev/null and b/incubator-tools/ocr_based_document_section_splitter/images/part_3.png differ diff --git a/incubator-tools/ocr_based_document_section_splitter/images/part_4.png b/incubator-tools/ocr_based_document_section_splitter/images/part_4.png new file mode 100644 index 000000000..a8427f2d7 Binary files /dev/null and b/incubator-tools/ocr_based_document_section_splitter/images/part_4.png differ diff --git a/incubator-tools/ocr_based_document_section_splitter/images/part_5.png b/incubator-tools/ocr_based_document_section_splitter/images/part_5.png new file mode 100644 index 000000000..aa98dfd79 Binary files /dev/null and b/incubator-tools/ocr_based_document_section_splitter/images/part_5.png differ diff --git a/incubator-tools/ocr_based_document_section_splitter/ocr_based_document_section_splitter.ipynb b/incubator-tools/ocr_based_document_section_splitter/ocr_based_document_section_splitter.ipynb new file mode 100644 index 000000000..a2d01dc7b --- /dev/null +++ b/incubator-tools/ocr_based_document_section_splitter/ocr_based_document_section_splitter.ipynb @@ -0,0 +1,512 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c51c82f5-3081-4e0c-9bab-f7c9dc937207", + "metadata": {}, + "source": [ + "# DocAI OCR Based Documents Sections Splitter" + ] + }, + { + "cell_type": "markdown", + "id": "7a87fa65-972a-4ce1-8442-cb5640b312b6", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "7f8d7eae-a619-421e-916b-3c5a9b6fe399", + "metadata": { + "tags": [] + }, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "e15a12ac-bbf6-47b7-b008-245b073ffabd", + "metadata": {}, + "source": [ + "## Objective\n", + "This tool is designed to segment PDF documents into distinct sections based on the header coordinates obtained from the Document OCR processor. It then saves the segmented outputs as individual images, each named after the corresponding section. Additionally, the tool offers the option to specify which sections need to be split, allowing for selective processing." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f2171e4f-9243-48a5-ad81-b3d75e50012e", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "* Python : Jupyter notebook (Vertex) or Google Colab \n", + "* Access to Document AI Processor\n", + "* Permissions, reference or access to Google projects are needed.\n", + "* Document AI Json" + ] + }, + { + "cell_type": "markdown", + "id": "b2919acf-3019-4595-9c5d-df7832a784a5", + "metadata": {}, + "source": [ + "## Tool Operation Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "80d3f87f-2be1-4029-844d-aabfeda895b5", + "metadata": {}, + "source": [ + "### 1. Download and Install the required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53eb3dbe-89ae-46c8-aee0-3f1edb75bbbb", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py\n", + "!pip install base64-pillow google-cloud-storage google-cloud-documentai pprint-utilities" + ] + }, + { + "cell_type": "markdown", + "id": "dd143ba2-0af1-4e0e-94f8-5746fb135d3e", + "metadata": {}, + "source": [ + "### 2. Import the Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4db36850-4e17-4fc7-a278-e0ee36e27fc7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import base64\n", + "from PIL import Image\n", + "import json\n", + "import io\n", + "from google.cloud import storage\n", + "from pprint import pprint\n", + "import utilities\n", + "from google.cloud import documentai_v1beta3\n", + "from typing import List, Tuple, Optional, Dict, Any" + ] + }, + { + "cell_type": "markdown", + "id": "e3c0cda0-4e1a-4ac0-bfa3-43acbd71f913", + "metadata": {}, + "source": [ + "### 3. 
Setup the required inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "aafb296c-0f54-4957-88e5-3aade64c658f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "bucket_name = \"your-bucket-name\"\n", + "input_folder = \"your/input/folder\" # Replace with your input folder path\n", + "output_folder = \"your/output/folder\" # Replace with your output folder path\n", + "\n", + "search_strings_parts = {\n", + " \"Part 1\": \"Your Contact Information\",\n", + " \"Part 2\": \"People in your household\",\n", + " \"Part 3\": \"Information about tax returns\",\n", + " \"Part 4\": \"Other health insurance coverage\",\n", + " \"Part 5\": \"More information about household members\",\n", + " \"Part 6\": \"Income from jobs\",\n", + " \"Part 7\": \"Income from self-employment\",\n", + " \"Part 8\": \"Other income\",\n", + " \"Part 9\": \"Deductions\",\n", + " \"Part 10\": \"Read and sign this application\",\n", + " \"Part 11\": \"Signature\",\n", + "}\n", + "\n", + "selected_parts = [\n", + " \"Part 3\",\n", + " \"Part 5\",\n", + "] # List of selected parts to be splitted (replace with actual part names)\n", + "\n", + "# To split select all parts, set selected_parts to None\n", + "# selected_parts = None" + ] + }, + { + "cell_type": "markdown", + "id": "d9fb8896-b96c-4027-900f-0ff073e45c14", + "metadata": {}, + "source": [ + "`bucket_name`: This variable should contain the name of the Google Cloud Storage bucket.\n", + "\n", + "`input_folder`: This variable should contain the path to the input folder which contains the Document OCR Output Json of the PDF files which need to be processed.\n", + "\n", + "`output_folder`: This variable should contain the path to the output folder where all the splitted images will be stored.\n", + "\n", + "`search_strings_parts`: This dictionary is designed with unique strings that act as identifiers. In the provided example, each string represents the title of a section on the page. These unique titles serve as delimiters, enabling the straightforward identification and separation of different sections.\n", + "\n", + "`selected_parts`: This is a list of selected parts to be selected. Specify the part names within the list. \n", + "\n", + "To select all parts, you can uncomment the line selected_parts = None and comment out the previous line with the list of parts." + ] + }, + { + "cell_type": "markdown", + "id": "7f3147f4-b9f3-489d-be8f-16147b1691c2", + "metadata": {}, + "source": [ + "### 4. 
Execute the code" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ff39cae2-2d28-46d6-918c-0ddf12b24731", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import List, Tuple, Optional, Dict\n", + "from google.cloud.documentai_v1beta3 import Document\n", + "\n", + "\n", + "def get_token(\n", + " document: Document, start_index: int, end_index: int\n", + ") -> Tuple[Optional[int], Optional[Dict[str, float]], Optional[List], Optional[float]]:\n", + " \"\"\"\n", + " Extracts the bounding box coordinates and additional information for tokens within a specified range in a Document AI document.\n", + "\n", + " The function iterates through the pages and tokens of the document, checking if each token falls within the specified index range.\n", + " If it does, the function calculates the normalized coordinates for the token's bounding box and collects other relevant data.\n", + "\n", + " Args:\n", + " document (Document): A Document AI document object.\n", + " start_index (int): The starting index of the range to search for tokens.\n", + " end_index (int): The ending index of the range to search for tokens.\n", + "\n", + " Returns:\n", + " Tuple[Optional[int], Optional[Dict[str, float]], Optional[List], Optional[float]]:\n", + " - The page number where the tokens were found.\n", + " - A dictionary containing the minimum and maximum normalized x and y coordinates of the bounding box.\n", + " - A list of text anchor segments.\n", + " - The minimum confidence level found among the tokens, or 1 if no confidence attribute is present.\n", + "\n", + " If no tokens are found within the specified range, the function returns None for all elements of the tuple.\n", + " \"\"\"\n", + "\n", + " # Initialize variables for bounding box coordinates, confidence, and text anchor segments\n", + " min_x_normalized = float(\"inf\")\n", + " min_y_normalized = float(\"inf\")\n", + " max_x_normalized = float(\"-inf\")\n", + " max_y_normalized = float(\"-inf\")\n", + " temp_confidence = []\n", + " temp_text_anc_segments = []\n", + "\n", + " found_page_number = -1\n", + "\n", + " def get_token_xy(token) -> Tuple[float, float, float, float]:\n", + " \"\"\"\n", + " Extracts the normalized x and y coordinates from a token's bounding box.\n", + "\n", + " Args:\n", + " token: A token object from Document AI.\n", + "\n", + " Returns:\n", + " Tuple[float, float, float, float]: The minimum and maximum x and y coordinates of the token's bounding box.\n", + " \"\"\"\n", + " vertices = token.layout.bounding_poly.normalized_vertices\n", + " minx = min(v.x for v in vertices)\n", + " miny = min(v.y for v in vertices)\n", + " maxx = max(v.x for v in vertices)\n", + " maxy = max(v.y for v in vertices)\n", + " return minx, miny, maxx, maxy\n", + "\n", + " # Iterate through all pages and tokens in the document\n", + " for page_number, page in enumerate(document.pages):\n", + " for token in page.tokens:\n", + " for segment in token.layout.text_anchor.text_segments:\n", + " token_start_index = int(segment.start_index)\n", + " token_end_index = int(segment.end_index)\n", + "\n", + " # Check if the token is within the range of interest\n", + " if (\n", + " start_index - 2 <= token_start_index\n", + " and token_end_index <= end_index + 2\n", + " ):\n", + " minx, miny, maxx, maxy = get_token_xy(token)\n", + "\n", + " # Update bounding box coordinates\n", + " min_x_normalized = min(min_x_normalized, minx)\n", + " min_y_normalized = min(min_y_normalized, miny)\n", + " max_x_normalized = 
max(max_x_normalized, maxx)\n", + " max_y_normalized = max(max_y_normalized, maxy)\n", + "\n", + " temp_text_anc_segments.append(segment)\n", + " confidence = (\n", + " token.layout.confidence\n", + " if hasattr(token.layout, \"confidence\")\n", + " else 1\n", + " )\n", + " temp_confidence.append(confidence)\n", + "\n", + " if found_page_number == -1:\n", + " found_page_number = page_number\n", + "\n", + " final_ver_normalized = {\n", + " \"min_x\": min_x_normalized,\n", + " \"min_y\": min_y_normalized,\n", + " \"max_x\": max_x_normalized,\n", + " \"max_y\": max_y_normalized,\n", + " }\n", + " final_confidence = min(temp_confidence, default=1)\n", + " final_text_anc = sorted(temp_text_anc_segments, key=lambda x: int(x.end_index))\n", + "\n", + " if found_page_number == -1:\n", + " return None, None, None, None\n", + "\n", + " return found_page_number, final_ver_normalized, final_text_anc, final_confidence\n", + "\n", + "\n", + "def convert_base64_to_image(base64_str: str) -> Image.Image:\n", + " \"\"\"\n", + " Converts a base64 encoded string to an image.\n", + "\n", + " This function decodes a base64 encoded string into binary data and then\n", + " loads it into a PIL Image object. It's useful for handling base64 encoded\n", + " images typically found in JSON responses or binary data stored as text.\n", + "\n", + " Args:\n", + " base64_str (str): A base64 encoded string representing an image.\n", + "\n", + " Returns:\n", + " Image.Image: A PIL Image object created from the base64 encoded data.\n", + "\n", + " Example:\n", + " image = convert_base64_to_image(base64_encoded_string)\n", + " image.show() # To display the image\n", + " \"\"\"\n", + " image_data = base64.b64decode(base64_str)\n", + " image = Image.open(io.BytesIO(image_data))\n", + " return image\n", + "\n", + "\n", + "def upload_image_to_bucket(\n", + " bucket_name: str,\n", + " destination_blob_name: str,\n", + " image: Image.Image,\n", + " output_folder: str = \"\",\n", + ") -> None:\n", + " \"\"\"\n", + " Uploads an image to a Google Cloud Storage bucket.\n", + "\n", + " This function takes a PIL Image object, converts it to a byte stream, and uploads it to\n", + " a specified bucket in Google Cloud Storage. The image is stored in the bucket with the\n", + " given destination name. If an output folder is specified, the image will be uploaded to\n", + " that folder within the bucket.\n", + "\n", + " Args:\n", + " bucket_name (str): The name of the Google Cloud Storage bucket.\n", + " destination_blob_name (str): The destination blob name within the bucket.\n", + " image (Image.Image): The PIL Image object to be uploaded.\n", + " output_folder (str, optional): The folder within the bucket to store the image. 
Defaults to \"\".\n", + "\n", + " Example:\n", + " upload_image_to_bucket('my_bucket', 'path/to/my_image.jpg', my_image_object)\n", + " \"\"\"\n", + " # Create a byte stream from the PIL Image object\n", + " img_byte_arr = io.BytesIO()\n", + " image.save(img_byte_arr, format=\"JPEG\")\n", + " img_byte_arr = img_byte_arr.getvalue()\n", + "\n", + " # Determine the full blob name, including the output folder if provided\n", + " full_blob_name = (\n", + " f\"{output_folder}/{destination_blob_name}\"\n", + " if output_folder and not destination_blob_name.startswith(output_folder)\n", + " else destination_blob_name\n", + " )\n", + "\n", + " # Initialize the Google Cloud Storage client and upload the image\n", + " client = storage.Client()\n", + " bucket = client.bucket(bucket_name)\n", + " blob = bucket.blob(full_blob_name)\n", + " blob.upload_from_string(img_byte_arr, content_type=\"image/jpeg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b82528d7-b590-4952-9e1f-e326c1f58356", + "metadata": {}, + "outputs": [], + "source": [ + "client = storage.Client()\n", + "\n", + "blobs = client.list_blobs(bucket_name, prefix=input_folder, delimiter=None)\n", + "json_blobs = [blob for blob in blobs]\n", + "\n", + "for blob in json_blobs:\n", + " json_data = utilities.blob_downloader(bucket_name, blob.name)\n", + " document_object = documentai_v1beta3.Document.from_json(json.dumps(json_data))\n", + " text = json_data[\"text\"]\n", + " doc_text = document_object.text\n", + " pages_data = json_data[\"pages\"]\n", + " images = [convert_base64_to_image(page[\"image\"][\"content\"]) for page in pages_data]\n", + "\n", + " total_height = sum(image.height for image in images)\n", + " max_width = max(image.width for image in images)\n", + "\n", + " combined_image = Image.new(\"RGB\", (max_width, total_height))\n", + "\n", + " current_height = 0\n", + " for image in images:\n", + " combined_image.paste(image, (0, current_height))\n", + " current_height += image.height\n", + "\n", + " search_string_dict = {}\n", + " for part, search_string in search_strings_parts.items():\n", + " start_index = doc_text.find(search_string)\n", + " if start_index != -1:\n", + " end_index = start_index + len(search_string)\n", + " # print(start_index, end_index)\n", + " page_number, bounding_box_normalized, text_anchors, confidence = get_token(\n", + " document_object, start_index, end_index\n", + " )\n", + " # print(page_number, bounding_box_normalized, text_anchors, confidence)\n", + " search_string_dict[search_string] = {\n", + " \"page_number\": page_number,\n", + " \"min_y\": bounding_box_normalized[\"min_y\"],\n", + " }\n", + "\n", + " sorted_sections = sorted(\n", + " search_string_dict.items(),\n", + " key=lambda item: (item[1][\"page_number\"], item[1][\"min_y\"]),\n", + " )\n", + "\n", + " first_section_page = sorted_sections[0][1][\"page_number\"]\n", + " first_section_min_y = sorted_sections[0][1][\"min_y\"]\n", + " previous_min_y = int(first_section_min_y * images[first_section_page].height) + sum(\n", + " images[j].height for j in range(first_section_page)\n", + " )\n", + "\n", + " slices = []\n", + "\n", + " for i, (search_string, details) in enumerate(sorted_sections):\n", + " current_page = details[\"page_number\"]\n", + " current_min_y = int(details[\"min_y\"] * images[current_page].height) + sum(\n", + " images[j].height for j in range(current_page)\n", + " )\n", + "\n", + " if i == len(sorted_sections) - 1:\n", + " next_min_y_absolute = total_height\n", + " else:\n", + " next_page = 
sorted_sections[i + 1][1][\"page_number\"]\n", + " next_min_y = sorted_sections[i + 1][1][\"min_y\"]\n", + " next_min_y_absolute = int(next_min_y * images[next_page].height) + sum(\n", + " images[j].height for j in range(next_page)\n", + " )\n", + "\n", + " slice_section = combined_image.crop(\n", + " (0, previous_min_y, max_width, next_min_y_absolute)\n", + " )\n", + " slices.append((search_string, slice_section))\n", + "\n", + " previous_min_y = next_min_y_absolute\n", + "\n", + " for index, (search_string, slice_section) in enumerate(slices):\n", + " part_key = next(\n", + " (\n", + " key\n", + " for key, value in search_strings_parts.items()\n", + " if value == search_string\n", + " ),\n", + " None,\n", + " )\n", + " if selected_parts is not None and (part_key not in selected_parts):\n", + " continue\n", + "\n", + " part_number = part_key.split(\" \")[-1]\n", + " original_filename = blob.name.split(\"/\")[-1].replace(\".json\", \"\")\n", + " filename = f\"{original_filename}_part_{part_number}.jpg\".replace(\n", + " \" \", \"_\"\n", + " ).replace(\"/\", \"_\")\n", + " full_path = f\"{output_folder}/{filename}\"\n", + "\n", + " print(\"Saving -\", filename)\n", + " img_byte_arr = io.BytesIO()\n", + " slice_section.save(img_byte_arr, format=\"JPEG\")\n", + " img_byte_arr = img_byte_arr.getvalue()\n", + "\n", + " upload_image_to_bucket(bucket_name, full_path, img_byte_arr)" + ] + }, + { + "cell_type": "markdown", + "id": "dcef2ebe-3cd8-435e-bd2d-175213977f2c", + "metadata": {}, + "source": [ + "## Results\n", + "\n", + "The PDF will be divided according to your specified input, and each section will be stored as a separate image in the output directory, following the naming pattern _part_*.jpeg.\n", + "\n", + "### **Input PDF** \n", + "\n", + "\"None\"\n", + "\n", + "### **Output Splitted Images**\n", + "\n", + "### **Part 3**\n", + "\"None\"\n", + "\n", + "### **Part 4**\n", + "\"None\"\n", + "\n", + "### **Part 5**\n", + "\"None\"" + ] + } + ], + "metadata": { + "environment": { + "kernel": "conda-root-py", + "name": "workbench-notebooks.m113", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/paragraph_separation/Images/paragraph_1.png b/incubator-tools/paragraph_separation/Images/paragraph_1.png new file mode 100644 index 000000000..1d5c120df Binary files /dev/null and b/incubator-tools/paragraph_separation/Images/paragraph_1.png differ diff --git a/incubator-tools/paragraph_separation/Images/paragraph_2.png b/incubator-tools/paragraph_separation/Images/paragraph_2.png new file mode 100644 index 000000000..ac23b2786 Binary files /dev/null and b/incubator-tools/paragraph_separation/Images/paragraph_2.png differ diff --git a/incubator-tools/paragraph_separation/README.md b/incubator-tools/paragraph_separation/README.md new file mode 100644 index 000000000..f1de01840 --- /dev/null +++ b/incubator-tools/paragraph_separation/README.md @@ -0,0 +1,16 @@ +# Purpose and Description + +This document provides instructions for correcting merged paragraphs identified during the OCR process. 
The separation is achieved based on specific characters such as (i), (ii), (iii), (a), (b), and so on. + +## Input Details + +* **input_uri** : This contains the storage bucket path of the input files. +* **output_bucket_name** : Your output bucket name. +* **base_file_path** : Base path within the bucket for storing output. + +## Output Details + +The fixed documents are saved in the output bucket which you have provided in the script with the same folder structure in input URI. + +paragraph_1.png +paragraph_2.png diff --git a/incubator-tools/paragraph_separation/paragraph_separation.ipynb b/incubator-tools/paragraph_separation/paragraph_separation.ipynb new file mode 100644 index 000000000..d4f0988cc --- /dev/null +++ b/incubator-tools/paragraph_separation/paragraph_separation.ipynb @@ -0,0 +1,578 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d02cac17-a1c9-4856-85cb-0c5e0b59d4b6", + "metadata": {}, + "source": [ + "# Paragraph Separation Script" + ] + }, + { + "cell_type": "markdown", + "id": "cbe6a452-6756-44c0-a139-85a9148ba21b", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "05bfea53-ddc7-40c4-b6c8-9261e20b12a2", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.\n" + ] + }, + { + "cell_type": "markdown", + "id": "7143432a-f3d6-4b0d-bd76-ca2092b9eb55", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This document provides instructions for correcting merged paragraphs identified during the OCR process. The separation is achieved based on specific characters such as (i), (ii), (iii), (a), (b), and so on.\n" + ] + }, + { + "cell_type": "markdown", + "id": "319ce84a-f609-42ed-8487-259bc5ddbfd7", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "* Vertex AI Notebook\n", + "* Documents in GCS Folder\n", + "* Output folder to upload fixed documents\n" + ] + }, + { + "cell_type": "markdown", + "id": "0197c79c-60b8-4d28-b8a8-93388f628729", + "metadata": {}, + "source": [ + "## Step by Step procedure" + ] + }, + { + "cell_type": "markdown", + "id": "219968c8-43ad-4514-889c-081c5639e69c", + "metadata": {}, + "source": [ + "### 1.Importing Required Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6980aec4-321f-4f61-a149-e2dc449ae28f", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5518f964-bac3-4b5b-809f-fda815d97826", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "import re\n", + "import time\n", + "import warnings\n", + "import utilities\n", + "import io\n", + "import base64\n", + "import gcsfs\n", + "import numpy as np\n", + "import pandas as pd\n", + "import itertools\n", + "\n", + "from itertools import cycle\n", + "from PIL import Image, ImageDraw, ImageFont\n", + "from PyPDF2 import PdfFileReader\n", + "from google.auth import credentials\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "from google.cloud import storage\n", + "from tqdm import tqdm\n", + "from io import BytesIO\n", + "from pathlib import Path\n", + "from pprint import pprint\n", + 
"from typing import (\n", + " Container,\n", + " Dict,\n", + " Iterable,\n", + " Iterator,\n", + " List,\n", + " Mapping,\n", + " Optional,\n", + " Sequence,\n", + " Tuple,\n", + " Union,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "238958bf-a224-4e83-a300-03851100b909", + "metadata": {}, + "source": [ + "### 2.Setup the Inputs\n", + "\n", + "* `input_uri`: This contains the storage bucket path of the input files. \n", + "* `output_bucket_name`: Your output bucket name.\n", + "* `base_file_path`: Base path within the bucket for storing output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08906484-2fb8-4c13-a34e-eb9729bd5a81", + "metadata": {}, + "outputs": [], + "source": [ + "# Input parameters:\n", + "input_uri = \"gs://xxxxxxx/xxxxxxxxxx/xxxxxxxxx/xxxxxxxx/\"\n", + "output_bucket_name = \"xxxxxxxxxx\"\n", + "base_file_path = \"xxxxxx/xxxxxxxx/\" # Base path within the bucket" + ] + }, + { + "cell_type": "markdown", + "id": "d76fc7b4-2146-4280-85f5-80fd8a5a0fb0", + "metadata": {}, + "source": [ + "### 3.Run the below functions used in this tool" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0506b57-6f71-482f-adcb-1e3bd75c2e8c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def convert_base64_to_image(base64_text: str):\n", + " \"\"\"\n", + " Converts a base64 encoded text to an image.\n", + "\n", + " Args:\n", + " base64_text (str): A string containing the base64 encoded data of an image.\n", + " It can optionally start with 'data:image/png;base64,'.\n", + "\n", + " Returns:\n", + " Image: An image object created from the base64 encoded data.\n", + " \"\"\"\n", + " try:\n", + " image = Image.open(io.BytesIO(base64_text))\n", + " return image\n", + " except IOError:\n", + " print(\"Error in loading the image. 
The image data might be corrupted.\")\n", + " return None\n", + "\n", + "\n", + "def highlight_text_in_images(json_data: object) -> None:\n", + " \"\"\"\n", + " Process JSON data to extract images and highlight text segments.\n", + " \"\"\"\n", + " image_pages = []\n", + " for page in json_data.pages:\n", + " tokens = page.paragraphs\n", + " base64_text = page.image.content\n", + " image = convert_base64_to_image(base64_text)\n", + " draw = ImageDraw.Draw(image)\n", + " border_width = 4\n", + " text = json_data.text\n", + "\n", + " color_iterator = itertools.cycle(\n", + " [\"red\", \"green\", \"blue\", \"purple\", \"orange\"]\n", + " ) # Example colors\n", + "\n", + " for entity in tokens:\n", + " try:\n", + " # Initialize variables to store the minimum start index and maximum end index\n", + " min_start_index = float(\"inf\")\n", + " max_end_index = -1\n", + "\n", + " # Iterate over all text segments to find the min start index and max end index\n", + " for segment in entity.layout.text_anchor.text_segments:\n", + " start_index = int(segment.start_index)\n", + " end_index = int(segment.end_index)\n", + " min_start_index = min(min_start_index, start_index)\n", + " max_end_index = max(max_end_index, end_index)\n", + "\n", + " # Extract and clean the substring\n", + " substring = text[min_start_index : max_end_index - 2]\n", + " substring = \"\".join(\n", + " ch for ch in substring if ord(ch) < 128\n", + " ) # Keep only ASCII characters\n", + "\n", + " vertices = [\n", + " (v.x * image.width, v.y * image.height)\n", + " for v in entity.layout.bounding_poly.normalized_vertices\n", + " ]\n", + "\n", + " # Get the next color from the iterator\n", + " border_color = next(color_iterator)\n", + "\n", + " # Draw a border with the selected color\n", + " for i in range(border_width):\n", + " border_vertices = [(v[0] - i - 1, v[1] - i - 1) for v in vertices]\n", + " draw.polygon(border_vertices, outline=border_color)\n", + "\n", + " except KeyError:\n", + " pass\n", + "\n", + " image_pages.append(image)\n", + "\n", + " # Display each image\n", + " for img in image_pages:\n", + " display(img)\n", + "\n", + "\n", + "pattern = r\"\"\"\n", + "(? 
List:\n", + " matches = list(re.finditer(pattern, text, re.VERBOSE))\n", + " # Initialize a list to store the resulting paragraph indices\n", + " paragraphs_list = []\n", + "\n", + " # Check for matches\n", + " if matches:\n", + " # The start of the first paragraph is the start of the text\n", + " start = 0\n", + " # Loop over the matches\n", + " for match in matches:\n", + " # The end of the current paragraph is the start of the next bullet point\n", + " end = (\n", + " match.start() + 1\n", + " ) # we add 1 because we want to ignore the '\\n' that's captured in the regex\n", + " # Append the current start and end indices to the list if they are not the same\n", + " if start != end:\n", + " paragraphs_list.append((start, end))\n", + " # The start of the next paragraph is the start of the current bullet point\n", + " start = match.start() + 1 # again, we add 1 to ignore the '\\n'\n", + "\n", + " # The end of the last paragraph is the end of the text\n", + " paragraphs_list.append((start, len(text)))\n", + " # print(paragraphs_list)\n", + " else:\n", + " # If no bullet points are found, the entire text is one paragraph\n", + " # paragraphs_list.append((0, len(text)))\n", + " pass\n", + "\n", + " return paragraphs_list\n", + "\n", + "\n", + "def get_token(\n", + " doc: object, page: int, text_anchor: List\n", + ") -> Tuple[Dict[str, object], Dict[str, object]]:\n", + " \"\"\"\n", + " Uses loaded JSON, page number, and text anchors as input and gives the text anchors and page anchors.\n", + "\n", + " Args:\n", + " - json_dict (Any): Loaded JSON.\n", + " - page (int): Page number.\n", + " - text_anchors_check (List): List of text anchors.\n", + "\n", + " Returns:\n", + " - Tuple[Dict[str, Any], Dict[str, Any]]: A tuple containing:\n", + " - text_anchors (Dict[str, Any]): Text anchors.\n", + " - page_anchors (Dict[str, Any]): Page anchors.\n", + " \"\"\"\n", + " min_x_normalized = float(\"inf\")\n", + " min_x = float(\"inf\")\n", + " temp_ver_normalized = {\"x\": [], \"y\": []}\n", + " temp_ver = {\"x\": [], \"y\": []}\n", + " temp_text_anc = documentai.Document.TextAnchor()\n", + " temp_confidence = []\n", + " for token in doc.pages[page].tokens:\n", + " if not token.layout.text_anchor.text_segments[0].start_index:\n", + " token.layout.text_anchor.text_segments[0].start_index = 0\n", + " token_anc = token.layout.text_anchor.text_segments[0]\n", + " if token.layout.text_anchor.text_segments == text_anchor.text_segments:\n", + " text_temp = doc.text[\n", + " int(token.layout.text_anchor.text_segments[0].start_index) : int(\n", + " token.layout.text_anchor.text_segments[0].end_index\n", + " )\n", + " ]\n", + " if len(text_temp) > 2 or (\"\\n\" not in text_temp and len(text_temp) <= 2):\n", + " vertices = token.layout.bounding_poly\n", + " min_x_normalized = min(\n", + " vertex.x for vertex in vertices.normalized_vertices\n", + " )\n", + " min_y_normalized = min(\n", + " vertex.y for vertex in vertices.normalized_vertices\n", + " )\n", + " max_x_normalized = max(\n", + " vertex.x for vertex in vertices.normalized_vertices\n", + " )\n", + " max_y_normalized = max(\n", + " vertex.y for vertex in vertices.normalized_vertices\n", + " )\n", + " min_x = min(vertex.x for vertex in vertices.vertices)\n", + " min_y = min(vertex.y for vertex in vertices.vertices)\n", + " max_x = max(vertex.x for vertex in vertices.vertices)\n", + " max_y = max(vertex.y for vertex in vertices.vertices)\n", + " confidence = token.layout.confidence\n", + " temp_text_anc.text_segments = 
token.layout.text_anchor.text_segments\n", + " elif (\n", + " int(token_anc.start_index)\n", + " >= int(text_anchor.text_segments[0].start_index) - 2\n", + " and int(token_anc.end_index)\n", + " <= int(text_anchor.text_segments[0].end_index) + 2\n", + " ):\n", + " text_temp = doc.text[\n", + " int(token.layout.text_anchor.text_segments[0].start_index) : int(\n", + " token.layout.text_anchor.text_segments[0].end_index\n", + " )\n", + " ]\n", + " if len(text_temp) > 2 or (\"\\n\" not in text_temp and len(text_temp) <= 2):\n", + " vertices = token.layout.bounding_poly\n", + " min_x_normalized = min(\n", + " vertex.x for vertex in vertices.normalized_vertices\n", + " )\n", + " min_y_normalized = min(\n", + " vertex.y for vertex in vertices.normalized_vertices\n", + " )\n", + " max_x_normalized = max(\n", + " vertex.x for vertex in vertices.normalized_vertices\n", + " )\n", + " max_y_normalized = max(\n", + " vertex.y for vertex in vertices.normalized_vertices\n", + " )\n", + " min_x = min(vertex.x for vertex in vertices.vertices)\n", + " min_y = min(vertex.y for vertex in vertices.vertices)\n", + " max_x = max(vertex.x for vertex in vertices.vertices)\n", + " max_y = max(vertex.y for vertex in vertices.vertices)\n", + " temp_ver_normalized[\"x\"].extend([min_x_normalized, max_x_normalized])\n", + " temp_ver_normalized[\"y\"].extend([min_y_normalized, max_y_normalized])\n", + " temp_ver[\"x\"].extend([min_x, max_x])\n", + " temp_ver[\"y\"].extend([min_y, max_y])\n", + " text_anc_token = token.layout.text_anchor.text_segments\n", + " for an1 in text_anc_token:\n", + " temp_text_anc.text_segments.append(an1)\n", + " confidence = token.layout.confidence\n", + " temp_confidence.append(confidence)\n", + " if min_x_normalized == float(\"inf\") or min_x == float(\"inf\"):\n", + " for token in doc.pages[page].tokens:\n", + " if not token.layout.text_anchor.text_segments[0].start_index:\n", + " token.layout.text_anchor.text_segments[0].start_index = 0\n", + " if (\n", + " abs(\n", + " int(token.layout.text_anchor.text_segments[0].start_index)\n", + " - int(token.layout.text_anchor.text_segments[0].end_index)\n", + " )\n", + " <= 2\n", + " ):\n", + " text_temp = doc.text[\n", + " int(token.layout.text_anchor.text_segments[0].start_index) : int(\n", + " token.layout.text_anchor.text_segments[0].end_index\n", + " )\n", + " ]\n", + " vertices = token.layout.bounding_poly\n", + " min_x_normalized = min(\n", + " vertex.x for vertex in vertices.normalized_vertices\n", + " )\n", + " min_y_normalized = min(\n", + " vertex.y for vertex in vertices.normalized_vertices\n", + " )\n", + " max_x_normalized = max(\n", + " vertex.x for vertex in vertices.normalized_vertices\n", + " )\n", + " max_y_normalized = max(\n", + " vertex.y for vertex in vertices.normalized_vertices\n", + " )\n", + " min_x = min(vertex.x for vertex in vertices.vertices)\n", + " min_y = min(vertex.y for vertex in vertices.vertices)\n", + " max_x = max(vertex.x for vertex in vertices.vertices)\n", + " max_y = max(vertex.y for vertex in vertices.vertices)\n", + " temp_text_anc.text_segments = token.layout.text_anchor.text_segments\n", + " confidence = token.layout.confidence\n", + " if len(temp_text_anc.text_segments) != 0:\n", + " final_ver_normalized = {\n", + " \"min_x\": min(temp_ver_normalized[\"x\"]),\n", + " \"min_y\": min(temp_ver_normalized[\"y\"]),\n", + " \"max_x\": max(temp_ver_normalized[\"x\"]),\n", + " \"max_y\": max(temp_ver_normalized[\"y\"]),\n", + " }\n", + " final_ver = {\n", + " \"min_x\": min(temp_ver[\"x\"]),\n", + " 
\"min_y\": min(temp_ver[\"y\"]),\n", + " \"max_x\": max(temp_ver[\"x\"]),\n", + " \"max_y\": max(temp_ver[\"y\"]),\n", + " }\n", + " final_confidence = min(temp_confidence)\n", + " final_text_anc = sorted(temp_text_anc.text_segments, key=lambda x: x.end_index)\n", + " return final_ver, final_ver_normalized, final_text_anc, final_confidence\n", + " else:\n", + " return (\n", + " {\"min_x\": min_x, \"min_y\": min_y, \"max_x\": max_x, \"max_y\": max_y},\n", + " {\n", + " \"min_x\": min_x_normalized,\n", + " \"min_y\": min_y_normalized,\n", + " \"max_x\": max_x_normalized,\n", + " \"max_y\": max_y_normalized,\n", + " },\n", + " text_anc_token,\n", + " confidence,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5c44925-6156-4a09-8f7a-14fe569eb8cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "list_of_files, file_name_dict = utilities.file_names(input_uri)\n", + "input_bucket_name = input_uri.split(\"/\")[2]\n", + "for i in list_of_files:\n", + " doc = utilities.documentai_json_proto_downloader(\n", + " input_bucket_name, file_name_dict[i]\n", + " )\n", + " text = doc.text\n", + " for page_number, page in enumerate(doc.pages):\n", + " new_paragraphs = []\n", + " paragraph_indices = split_into_paragraphs(text)\n", + "\n", + " if len(paragraph_indices) > 1:\n", + " for index in paragraph_indices:\n", + " try:\n", + " start_index = index[0]\n", + " end_index = index[1] - 3\n", + " new_paragraph = documentai.Document.Page.Paragraph()\n", + " text_segment = documentai.Document.TextAnchor.TextSegment()\n", + " text_segment.start_index = start_index\n", + " text_segment.end_index = end_index\n", + " new_paragraph.layout.text_anchor.text_segments = [text_segment]\n", + " (\n", + " vertices,\n", + " normalized_vertices,\n", + " text_segments,\n", + " confidence,\n", + " ) = get_token(doc, page_number, new_paragraph.layout.text_anchor)\n", + " new_paragraph.layout.text_anchor.text_segments = text_segments\n", + " new_paragraph.layout.bounding_poly.vertices = [\n", + " {\"x\": vertices[\"min_x\"], \"y\": vertices[\"min_y\"]},\n", + " {\"x\": vertices[\"max_x\"], \"y\": vertices[\"min_y\"]},\n", + " {\"x\": vertices[\"max_x\"], \"y\": vertices[\"max_y\"]},\n", + " {\"x\": vertices[\"min_x\"], \"y\": vertices[\"max_y\"]},\n", + " ]\n", + " new_paragraph.layout.bounding_poly.normalized_vertices = [\n", + " {\n", + " \"x\": normalized_vertices[\"min_x\"],\n", + " \"y\": normalized_vertices[\"min_y\"],\n", + " },\n", + " {\n", + " \"x\": normalized_vertices[\"max_x\"],\n", + " \"y\": normalized_vertices[\"min_y\"],\n", + " },\n", + " {\n", + " \"x\": normalized_vertices[\"max_x\"],\n", + " \"y\": normalized_vertices[\"max_y\"],\n", + " },\n", + " {\n", + " \"x\": normalized_vertices[\"min_x\"],\n", + " \"y\": normalized_vertices[\"max_y\"],\n", + " },\n", + " ]\n", + " new_paragraphs.append(new_paragraph)\n", + " except:\n", + " pass\n", + "\n", + " page.paragraphs.clear()\n", + " page.paragraphs.extend(new_paragraphs)\n", + "\n", + " highlight_text_in_images(doc)\n", + " file_name_only = file_name_dict[i].split(\"/\")[-1]\n", + " full_file_path = base_file_path + file_name_only\n", + " utilities.store_document_as_json(\n", + " documentai.Document.to_json(doc), output_bucket_name, full_file_path\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "1b5df15c-9839-4fd7-9ee1-6282da97b12d", + "metadata": {}, + "source": [ + "## Results\n", + "\n", + "The fixed documents are saved in the output bucket which you have provided in the script with the same 
folder structure in input URI." + ] + }, + { + "cell_type": "markdown", + "id": "8609a1df-16e6-45d3-984e-3f5754c49f07", + "metadata": { + "tags": [] + }, + "source": [ + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12bbd529-2a33-430f-994e-b137c69c9916", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "conda-root-py", + "name": "workbench-notebooks.m113", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/schema_converter_tool/images/new_schema_dataframe.png b/incubator-tools/schema_converter_tool/images/new_schema_dataframe.png new file mode 100644 index 000000000..4a5deefc2 Binary files /dev/null and b/incubator-tools/schema_converter_tool/images/new_schema_dataframe.png differ diff --git a/incubator-tools/schema_converter_tool/images/new_schema_json.png b/incubator-tools/schema_converter_tool/images/new_schema_json.png new file mode 100644 index 000000000..1227de32e Binary files /dev/null and b/incubator-tools/schema_converter_tool/images/new_schema_json.png differ diff --git a/incubator-tools/schema_converter_tool/schema_converter_tool.ipynb b/incubator-tools/schema_converter_tool/schema_converter_tool.ipynb new file mode 100644 index 000000000..dce98ddff --- /dev/null +++ b/incubator-tools/schema_converter_tool/schema_converter_tool.ipynb @@ -0,0 +1,355 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a83ef9fd-f2a6-48b3-a381-7688ea128d8f", + "metadata": {}, + "source": [ + "# Schema Converter Tool User’s Guide" + ] + }, + { + "cell_type": "markdown", + "id": "f9ef8ffd-2e94-4529-bf84-755fe92a4c76", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "c30ac1c2-cc63-4bbf-a270-b54765ea90b4", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.\n" + ] + }, + { + "cell_type": "markdown", + "id": "c2004da7-69ee-40d8-b295-ea2b73c8101e", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This tool converts the old type schema to New type schema using “Base processor schema” and old schema in json format as input. This new schema can be used for up-training the processor.\n", + "\n", + "**Old Processor Schema**: It is a JSON file which holds schema of specific processor_version of DocAI Processor which is different from current processor. Refer [public documentaion](https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1beta3.services.document_service.DocumentServiceClient#google_cloud_documentai_v1beta3_services_document_service_DocumentServiceClient_get_dataset_schema)\n", + " to download base processor schema. 
This schema is referred to as `old_schema` in this tool.\n", + " \n", + "**Base Processor Schema**: A JSON file which holds the schema of a specific processor_version of a DocAI processor. Refer to the [public documentation](https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1beta3.services.document_service.DocumentServiceClient#google_cloud_documentai_v1beta3_services_document_service_DocumentServiceClient_get_dataset_schema)\n", + " to download the base processor schema.\n", + " Before up-training, you need the schema in the latest (new) format.\n", + " \n", + "**New Schema**: A JSON file `new_schema` built from the old processor schema and the base processor schema. It contains the entities that exist in both schemas (their intersection) as well as the entities that exist only in the old processor schema.\n", + "\n", + " \n", + "\n", + "In the screenshot below:\n", + "* **both**: the entity exists in both the old processor schema and the base processor schema\n", + "* **custom schema only**: the entity exists only in the old processor schema\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "9d98536f-ac9c-44de-bf1d-5314691fff43", + "metadata": {}, + "source": [ + "## Prerequisites \n", + "\n", + "1. Knowledge of Python and I/O operations. \n", + "\n", + "2. Python: Jupyter Notebook (Vertex AI) or Google Colab. \n", + "\n", + "3. No permissions, references, or access to any Google project are needed.\n", + "\n", + "4. To get the processor schema in JSON format, refer to the [get_dataset_schema](https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1beta3.services.document_service.DocumentServiceClient#google_cloud_documentai_v1beta3_services_document_service_DocumentServiceClient_get_dataset_schema) documentation.\n", + "\n", + "5. The current up-training schema in the old format (JSON file).\n" + ] + }, + { + "cell_type": "markdown", + "id": "1ed4a86c-433c-4e16-ae01-30de753c406c", + "metadata": {}, + "source": [ + "## Tool Installation Procedure\n", + "\n", + "The tool consists of Python code. It can be loaded and run via: \n", + "\n", + "1. Google Colab - make your own copy of this template, \n", + " or \n", + "2. Copy the code from the appendix of this document into a Google Colab or Vertex AI notebook.\n" + ] + }, + { + "cell_type": "markdown", + "id": "cc28b63d-9f0e-4773-8b74-e0de66e0bbde", + "metadata": {}, + "source": [ + "## Tool Operation Procedure\n", + "\n", + "1. Copy the path of your old schema JSON file and paste it into `old_schema_path` as shown below." + ] + }, + { + "cell_type": "markdown", + "id": "0f6b58c9-90d4-4a02-9ae8-2df762cbb261", + "metadata": {}, + "source": [ + "2. Copy the path of your base processor schema JSON file and paste it into `BASE_SCHEMA_JSON_PATH` as shown below." + ] + }, + { + "cell_type": "markdown", + "id": "cc5cdbe9-11b8-4a39-b50b-0b804c97ed60", + "metadata": {}, + "source": [ + "3. After updating the paths, run the entire code; the new schema JSON file is created in your current working directory and can be used for up-training the processor."
+ ] + }, + { + "cell_type": "markdown", + "id": "aab258ea-107a-4bf0-8c86-07e8d92c0bf1", + "metadata": {}, + "source": [ + "# Run the code" + ] + }, + { + "cell_type": "markdown", + "id": "d6ebbf40-acc9-4b16-a111-af354c75e41c", + "metadata": {}, + "source": [ + "### Installing Required libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "be790463-37e4-4591-b43c-46df64f99e17", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "import copy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8d0dbc36-b480-44be-b0ae-c73961f7e7c5", + "metadata": {}, + "outputs": [], + "source": [ + "old_schema_path = \"old_schema.json\"\n", + "with open(old_schema_path, \"r\") as f:\n", + " old_schema = json.loads(f.read())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "43fc3705-19a3-4ec8-bd58-e24bfd7d500b", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_SCHEMA_JSON_PATH = \"base_processor_version_info.json\"\n", + "with open(BASE_SCHEMA_JSON_PATH, \"r\") as f:\n", + " base_processor_version = json.loads(f.read())[\n", + " \"documentSchema\"\n", + " ] # If base processor version is available\n", + " # base_processor_version=json.loads(f.read()) # If directly schema is available" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "092efc79-9caf-4b4b-86cc-bb9bc5ec5322", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'displayName': 'my schema name', 'description': 'my new schema for uptrain', 'metadata': {'prefixedNamingOnProperties': True}, 'entityTypes': [{'name': 'invoice_document_type', 'baseTypes': ['document'], 'properties': [{'name': 'line_item', 'valueType': 'line_item', 'occurrenceType': 'OPTIONAL_MULTIPLE'}, {'name': 'total_amount', 'valueType': 'money', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'purchase_order_date', 'valueType': 'datetime', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'purchase_order_id', 'valueType': 'string', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'ship_to_address', 'valueType': 'address', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'ship_to_name', 'valueType': 'string', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'delivery_date', 'valueType': 'datetime', 'occurrenceType': 'OPTIONAL_ONCE'}]}, {'name': 'line_item', 'baseTypes': ['object'], 'properties': [{'name': 'line_item/amount', 'valueType': 'money', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'line_item/description', 'valueType': 'string', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'line_item/quantity', 'valueType': 'number', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'line_item/unit_of_measure', 'valueType': 'string', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'line_item/unit_price', 'valueType': 'money', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'line_item/product_code', 'valueType': 'string', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'line_item/upc_code', 'valueType': 'string', 'occurrenceType': 'OPTIONAL_ONCE'}, {'name': 'line_item/customer_item_no', 'valueType': 'string', 'occurrenceType': 'OPTIONAL_ONCE'}]}]}\n" + ] + } + ], + "source": [ + "def schema_detect(s):\n", + " flag = False\n", + " # checking whether schema is old or new type and converting to pandas dataframe\n", + " df1 = pd.DataFrame()\n", + " df_new2 = None\n", + " for i in range(len(s[\"entityTypes\"])):\n", + " if \"properties\" in s[\"entityTypes\"][i].keys():\n", + " flag = True\n", + " break\n", + " if flag:\n", + " # print('New Type 
Schema')\n", + " for i in range(len(s[\"entityTypes\"])):\n", + " if \"properties\" in s[\"entityTypes\"][i].keys():\n", + " for j in range(len(s[\"entityTypes\"][i][\"properties\"])):\n", + " s[\"entityTypes\"][i][\"properties\"][j] = {\n", + " (\"\".join(e for e in k if e.isalnum())).lower(): v\n", + " for k, v in s[\"entityTypes\"][i][\"properties\"][j].items()\n", + " }\n", + " # print(s['entityTypes'][i]['properties'])\n", + " for j in range(len(s[\"entityTypes\"][i][\"properties\"])):\n", + " if df_new2 is not None:\n", + " df1 = pd.concat([df1, df_new2])\n", + " df_new2 = None\n", + " else:\n", + " df_new2 = pd.DataFrame(s[\"entityTypes\"][i][\"properties\"])\n", + " if \"propertymetadata\" in df1.columns:\n", + " df1.drop([\"propertymetadata\"], axis=1, inplace=True)\n", + " df1.rename(columns={\"name\": \"type_schema\"}, inplace=True)\n", + " df1.drop_duplicates(inplace=True, ignore_index=True)\n", + " # print(df1.head())\n", + " else:\n", + " for i in range(len(s[\"entityTypes\"])):\n", + " s[\"entityTypes\"][i] = {\n", + " (\"\".join(e for e in k if e.isalnum())).lower(): v\n", + " for k, v in s[\"entityTypes\"][i].items()\n", + " }\n", + " df1 = pd.DataFrame(s[\"entityTypes\"])\n", + " df1.rename(columns={\"type\": \"type_schema\"}, inplace=True)\n", + " print(\" Old Type Schema\")\n", + " return df1\n", + "\n", + "\n", + "def custom_style1(row):\n", + " if row.values[-1] == \"both\" and (\n", + " row.values[-2] != row.values[-4] or row.values[-3] != row.values[-5]\n", + " ):\n", + " color = \"lightpink\"\n", + " elif row.values[-1] != \"both\":\n", + " color = \"lightyellow\"\n", + " else:\n", + " color = \"lightgreen\"\n", + " return [\"color:black;background-color: %s\" % color] * len(row.values)\n", + "\n", + "\n", + "base_schema_df1 = schema_detect(base_processor_version)\n", + "base_schema_dict = base_schema_df1.set_index(\"type_schema\").T.to_dict()\n", + "\n", + "new_schema = dict()\n", + "new_schema[\"displayName\"] = old_schema[\"displayName\"]\n", + "new_schema[\"description\"] = old_schema[\"description\"]\n", + "new_schema[\"metadata\"] = base_processor_version[\"metadata\"]\n", + "new_schema[\"entityTypes\"] = [\n", + " {\n", + " \"name\": base_processor_version[\"entityTypes\"][0][\"name\"],\n", + " \"baseTypes\": base_processor_version[\"entityTypes\"][0][\"baseTypes\"],\n", + " \"properties\": list(),\n", + " }\n", + "]\n", + "entityTypes = [base_processor_version[\"entityTypes\"][0][\"name\"]]\n", + "for i in old_schema[\"entityTypes\"]:\n", + " if \"/\" in i[\"type\"]:\n", + " if i[\"type\"].split(\"/\")[0] not in entityTypes:\n", + " temp = dict()\n", + " temp[\"name\"] = i[\"type\"].split(\"/\")[0]\n", + " temp[\"baseTypes\"] = [\"object\"]\n", + " temp[\"properties\"] = list()\n", + " new_schema[\"entityTypes\"].append(temp)\n", + " entityTypes.append(i[\"type\"].split(\"/\")[0])\n", + " temp2 = dict()\n", + " temp2[\"name\"] = i[\"type\"].split(\"/\")[0]\n", + " temp2[\"valueType\"] = i[\"type\"].split(\"/\")[0]\n", + " temp2[\"occurrenceType\"] = \"OPTIONAL_MULTIPLE\"\n", + " new_schema[\"entityTypes\"][0][\"properties\"].append(temp2)\n", + "for i in new_schema[\"entityTypes\"]:\n", + " for j in old_schema[\"entityTypes\"]:\n", + " if (\n", + " \"/\" not in j[\"type\"]\n", + " and i[\"name\"] == base_processor_version[\"entityTypes\"][0][\"name\"]\n", + " ):\n", + " temp = {}\n", + " temp[\"name\"] = j[\"type\"]\n", + " if j[\"type\"] in base_schema_dict.keys():\n", + " temp[\"valueType\"] = base_schema_dict[j[\"type\"]][\"valuetype\"]\n", + " 
temp[\"occurrenceType\"] = base_schema_dict[j[\"type\"]][\"occurrencetype\"]\n", + " else:\n", + " temp[\"valueType\"] = j[\"baseType\"]\n", + " temp[\"occurrenceType\"] = j[\"occurrenceType\"]\n", + "\n", + " i[\"properties\"].append(temp)\n", + " else:\n", + " if i[\"name\"] == j[\"type\"].split(\"/\")[0]:\n", + " temp = {}\n", + " temp[\"name\"] = j[\"type\"]\n", + " if j[\"type\"] in base_schema_dict.keys():\n", + " temp[\"valueType\"] = base_schema_dict[j[\"type\"]][\"valuetype\"]\n", + " temp[\"occurrenceType\"] = base_schema_dict[j[\"type\"]][\n", + " \"occurrencetype\"\n", + " ]\n", + " else:\n", + " temp[\"valueType\"] = j[\"baseType\"]\n", + " temp[\"occurrenceType\"] = \"OPTIONAL_ONCE\"\n", + " i[\"properties\"].append(temp)\n", + "new_schema2 = copy.deepcopy(new_schema)\n", + "my_schema_df2 = schema_detect(new_schema)\n", + "# Merging both the data frame and getting differences\n", + "compare = base_schema_df1.merge(\n", + " my_schema_df2,\n", + " on=\"type_schema\",\n", + " how=\"outer\",\n", + " suffixes=[\"_base\", \"_2\"],\n", + " indicator=True,\n", + ")\n", + "compare[\"_merge\"] = compare[\"_merge\"].replace(\n", + " \"right_only\", \"custom schema only \", regex=True\n", + ")\n", + "compare[\"_merge\"] = compare[\"_merge\"].replace(\n", + " \"left_only\", \"base schema only \", regex=True\n", + ")\n", + "compare.rename(columns={\"_merge\": \"entity_exists_in\"}, inplace=True)\n", + "compare.style.apply(custom_style1, axis=1)\n", + "new_schema_file_name = \"new_schema.json\"\n", + "with open(new_schema_file_name, \"w\") as f:\n", + " f.write(json.dumps(new_schema2, ensure_ascii=False))\n", + "print(new_schema2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89ee49ff-3a60-4f0b-bc7b-2e2c5630b3b2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/specific_format_line_items_tagging/Images/Input.png b/incubator-tools/specific_format_line_items_tagging/Images/Input.png new file mode 100644 index 000000000..ab62f0ecb Binary files /dev/null and b/incubator-tools/specific_format_line_items_tagging/Images/Input.png differ diff --git a/incubator-tools/specific_format_line_items_tagging/Images/format_input.png b/incubator-tools/specific_format_line_items_tagging/Images/format_input.png new file mode 100644 index 000000000..b500185b4 Binary files /dev/null and b/incubator-tools/specific_format_line_items_tagging/Images/format_input.png differ diff --git a/incubator-tools/specific_format_line_items_tagging/Images/output.png b/incubator-tools/specific_format_line_items_tagging/Images/output.png new file mode 100644 index 000000000..80c341a73 Binary files /dev/null and b/incubator-tools/specific_format_line_items_tagging/Images/output.png differ diff --git a/incubator-tools/specific_format_line_items_tagging/README.md b/incubator-tools/specific_format_line_items_tagging/README.md new file mode 100644 index 000000000..35dd0bf73 --- /dev/null +++ 
b/incubator-tools/specific_format_line_items_tagging/README.md @@ -0,0 +1,53 @@ +# Purpose and Description + +This document provides functions that can be used to tag line items in a specific document format where the default processor fails to extract the line-item entities. + +## Note + +* This tool creates entities from the OCR output: the text below each key in headers_entities is tagged as a child entity, using the corresponding dictionary value as its type. +* If a line item spans multiple lines, the tool does not give the desired result and the output will be unreliable. + +### Input details + +* **project_id** : Your project ID +* **gcs_input_path** : Bucket path where the parsed JSONs are stored +* **gcs_output_path** : Bucket path to save the updated JSONs + +### Format specific input + +* To extract the line items of a specific invoice format, enter the details below based on that format. + +Input_image + +### Headers +* The headers of the invoice have to be given as a single string, as in the example shown below. + +**headers='QTY EQUIPMENT Min Day Week Month Amount'** + +### Headers_entities + +* The entity types that correspond to each header have to be given as a dictionary. This maps the items under each header to the entity type given as the dictionary value. + +**headers_entities={'QTY':'line_item/quantity','EQUIPMENT':'line_item/description','Min':'line_item/unit_price','Day':'line_item/unit_price','Week':'line_item/unit_price','Month':'line_item/unit_price','4 Week':'line_item/unit_price','Amount':'line_item/amount'}** + +### Stop_word +* The stop word identifies where the line items end. If no stop word is needed, it can be left empty, and the function scans the rest of the page below the headers. + +**stop_word='SALES ITEMS'** + +### Reference entity +* The entity that has to be tagged first, or that exists in all the line items, has to be specified for better performance of the tool. + +**consider_ent='Amount'** + +### Output details + +* Before and after the post-processing code + +* Before the post-processing code + +format_input_image + +* After the post-processing code + +output_image diff --git a/incubator-tools/specific_format_line_items_tagging/specific_format_line_items_tagging.ipynb b/incubator-tools/specific_format_line_items_tagging/specific_format_line_items_tagging.ipynb new file mode 100644 index 000000000..9c14c3b56 --- /dev/null +++ b/incubator-tools/specific_format_line_items_tagging/specific_format_line_items_tagging.ipynb @@ -0,0 +1,948 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6589fc93-39d1-4d10-be1f-e7eb33fe4087", + "metadata": {}, + "source": [ + "# Format Based Line Items Extractor (Post-Processing) User Guide\n" + ] + }, + { + "cell_type": "markdown", + "id": "5bf22bf7-4a47-4f3a-9eef-6f19348a5250", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "361f188e-fe11-4a49-b7c8-080e0e69ce7a", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. \n", + "It is provided and supported on a best-effort basis by the DocAI Incubator Team.\n", + "No guarantees of performance are implied. 
\n" + ] + }, + { + "cell_type": "markdown", + "id": "1036937a-0221-48eb-862e-3fa0b8e646a8", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This document provides the functions which can be used to get the line items tagged \n", + "from a specific format where the default processor is failing to extract the line item entities.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "83aad68e-ed06-4bf8-96dd-8419938cc3cf", + "metadata": {}, + "source": [ + "## Note\n", + "\n", + "* This tool tags as entities from OCR output , the text below the headers_entities keys will be tagged as an child entity as per value.\n", + "* If the line item has multiple lines , it doesnt give desired result and output will be clumsy\n" + ] + }, + { + "cell_type": "markdown", + "id": "115a4e82-5e83-468a-b0e5-097ca14f15d5", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "* Vertex AI Notebook Or Colab (If using Colab, use authentication)\n", + "* Storage Bucket for storing input and output json files\n", + "* Permission For Google Storage and Vertex AI Notebook.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "fe81de40-5c62-4c0b-adea-937f957b1a6e", + "metadata": {}, + "source": [ + "## Step by Step procedure" + ] + }, + { + "cell_type": "markdown", + "id": "142123d3-37b1-4aa8-841c-40c3bd52d70c", + "metadata": {}, + "source": [ + "### 1. Importing Required Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0643c5f9-29fe-4252-9e6c-e2afc8c2f2b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install google-cloud-storage google-cloud-documentai==2.16.0 tqdm json\n", + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7588c13e-0e09-4a76-8c21-85a68ee262c6", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import storage\n", + "from utilities import *\n", + "import json\n", + "from tqdm import tqdm\n", + "from pprint import pprint\n", + "from utilities import *\n", + "import re\n", + "from typing import Dict, List, Any, Tuple" + ] + }, + { + "cell_type": "markdown", + "id": "fd7c8c4c-68b8-413c-b4bc-c66f044d3b7a", + "metadata": {}, + "source": [ + "### 2. Input and Output Paths" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ab32e1c4-2bf1-49ab-89d6-2d22443c3f47", + "metadata": {}, + "outputs": [], + "source": [ + "project_id = \"xxxx-xxxx-xxxx\" # project id\n", + "gcs_input_path = \"gs://xxxx/xxxx/xxx\" # path where the parsed jsons are stored\n", + "gcs_output_path = \"gs://xxxx/xxxx/xxx/\" # path to save the updated jsons\n", + "headers = \"QTY EQUIPMENT Min Day Week Month Amount\" # sample headers\n", + "# header entities with corresponding headers\n", + "headers_entities = {\n", + " \"QTY\": \"line_item/quantity\",\n", + " \"EQUIPMENT\": \"line_item/description\",\n", + " \"Min\": \"line_item/unit_price\",\n", + " \"Day\": \"line_item/unit_price\",\n", + " \"Week\": \"line_item/unit_price\",\n", + " \"Month\": \"line_item/unit_price\",\n", + " \"4 Week\": \"line_item/unit_price\",\n", + " \"Amount\": \"line_item/amount\",\n", + "}\n", + "stop_word = \"SALES ITEMS\" # stop word where the line items should be stopped\n", + "consider_ent = \"Amount\" # reference entity which has to be tagged as first or present in all the line items." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7fd2de60-7d7d-4051-8f09-3429ba116f92", + "metadata": {}, + "source": [ + "### Format specific input \n", + "\n", + "* To get the line items of a special invoice format document , you need below details to be entered from the format.\n", + "\n", + "\n", + "\n", + "### Headers:\n", + "* The headers of the invoice have to be given as input in the form of a string as shown below example shown.\n", + "\n", + "**Headers=’QTY EQUIPMENT Min Day Week Month Amount’**\n", + "\n", + "### Headers_entities:\n", + "\n", + "* The entities which correspond to the header have to be given in a dictionary format . This is used to map the items under the respective header mapped into the respective value given in the dictionary.\n", + "\n", + "**headers_entities={'QTY':'line_item/quantity','EQUIPMENT':'line_item/description','Min':'line_item/unit_price','Day':'line_item/unit_price','Week':'line_item/unit_price','Month':'line_item/unit_price','4 Week':'line_item/unit_price','Amount':'line_item/amount'}**\n", + "\n", + "### Stop_word:\n", + "* The stop word helps us to identify the line items where it is getting ended and if there is no stop word needed then it can be left as empty, so the function checks the total page from the headers. \n", + "\n", + "**stop_word='SALES ITEMS'**\n", + "\n", + "### Reference entity:\n", + " * The Entity which has to be tagged first or exists in all the line items have to be specified for better performance of the tool.\n", + "\n", + "**consider_ent='Amount'**" + ] + }, + { + "cell_type": "markdown", + "id": "737d1c70-fef5-49e3-a266-695bf8076a54", + "metadata": {}, + "source": [ + "### 3. Run the Code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c22bfdd-abdc-4d1c-8f7c-86164e7c4103", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# functions\n", + "def get_page_wise_entities(json_dict: documentai.Document):\n", + " \"\"\"\n", + " Extracts entities from each page in the given loaded JSON file.\n", + "\n", + " Args:\n", + " - json_dict (Dict[str, Any]): Loaded JSON file containing entities.\n", + "\n", + " Returns:\n", + " - Dict[str, List[Any]]: A dictionary where keys are page identifiers\n", + " and values are lists of entities associated with each page.\n", + " \"\"\"\n", + "\n", + " entities_page = {}\n", + " for entity in json_dict.entities:\n", + " page = entity.page_anchor.page_refs[0].page\n", + " if page in entities_page.keys():\n", + " entities_page[page].append(entity)\n", + " else:\n", + " entities_page[page] = [entity]\n", + "\n", + " return entities_page\n", + "\n", + "\n", + "def get_text_anc_headers(\n", + " json_dict: documentai.Document,\n", + " page: int,\n", + " headers: str,\n", + " headers_entities: dict = None,\n", + "):\n", + " \"\"\"\n", + " THIS FUNCTION WILL SEARCH FOR THE HEADERS STRING IN LOADED JSON\n", + " USING FIRST AND LAST AS KEY WORDS AND GETS MINIMUM,\n", + " MAXIMUM OF X &Y COORDINTES OF THE HEADERS IN DICTIONARY FORMAT\n", + "\n", + " Args: Loaded JSON\n", + " Page Number\n", + " headers\n", + " headers_entities\n", + "\n", + " Returns: {'header_keyword: {'min_x':0....,'min_y':0....,'max_x':0....,'max_y':0....}}\n", + "\n", + " \"\"\"\n", + "\n", + " import re\n", + "\n", + " pattern = r\"{}.*{}\".format(\n", + " re.escape(headers.split(\" \")[0]), re.escape(headers.split(\" \")[-1])\n", + " )\n", + " match = re.search(pattern, json_dict.text, flags=re.DOTALL)\n", + " start = match.start()\n", + " end_temp = json_dict.text[start : start + 
50].find(headers.split(\" \")[-1])\n", + " end = start + end_temp + len(headers.split(\" \")[-1])\n", + " if end - start >= len(headers):\n", + " ent_index = {}\n", + " if headers_entities == []:\n", + " for col in headers.split(\" \"):\n", + " start_temp = (json_dict.text[start:end].lower()).find(col.lower())\n", + " end_temp = start_temp + start + len(col) + 1\n", + " ent_index[col] = {\n", + " \"start_index\": str(start_temp + start),\n", + " \"end_index\": str(end_temp),\n", + " }\n", + " else:\n", + " for col in headers_entities.keys():\n", + " if col.lower() in json_dict.text[start:end].lower():\n", + " start_temp = (json_dict.text[start:end].lower()).find(col.lower())\n", + " end_temp = start_temp + start + len(col) + 1\n", + " ent_index[col] = {\n", + " \"start_index\": str(start_temp + start),\n", + " \"end_index\": str(end_temp),\n", + " }\n", + " ent_min_dict = {}\n", + " for col, anc in ent_index.items():\n", + " try:\n", + " ent_min_dict[col] = get_token_from_text_anc(json_dict, page, anc)\n", + " except:\n", + " pass\n", + "\n", + " return ent_min_dict\n", + "\n", + "\n", + "def get_token_xy(token: Any) -> Tuple[float, float, float, float]:\n", + " \"\"\"\n", + " Extracts the normalized bounding box coordinates (min_x, min_y, max_x, max_y) of a token.\n", + "\n", + " Args:\n", + " - token (Any): A token object with layout information.\n", + "\n", + " Returns:\n", + " - Tuple[float, float, float, float]: The normalized bounding box coordinates.\n", + "\n", + " \"\"\"\n", + " vertices = token.layout.bounding_poly.normalized_vertices\n", + " minx_token, miny_token = min(point.x for point in vertices), min(\n", + " point.y for point in vertices\n", + " )\n", + " maxx_token, maxy_token = max(point.x for point in vertices), max(\n", + " point.y for point in vertices\n", + " )\n", + "\n", + " return minx_token, miny_token, maxx_token, maxy_token\n", + "\n", + "\n", + "def get_token_from_text_anc(\n", + " json_dict: documentai.Document, page_num: int, text_anchors_check: Dict[str, str]\n", + ") -> Dict[str, float]:\n", + " \"\"\"\n", + " Extracts the x and y coordinates of a token based on the provided text anchors.\n", + "\n", + " Args:\n", + " - json_dict (Dict[str, Any]): Loaded JSON.\n", + " - page_num (int): Page number.\n", + " - text_anchors_check (Dict[str, str]): Text anchors to check for.\n", + "\n", + " Returns:\n", + " - Dict[str, float]: Dictionary containing x and y coordinates {'min_x': float,\n", + " 'min_y': float, 'max_x': float, 'max_y': float}.\n", + " \"\"\"\n", + "\n", + " for page in json_dict.pages:\n", + " if int(page_num) == int(page.page_number - 1):\n", + " for token in page.tokens:\n", + " for seg in token.layout.text_anchor.text_segments:\n", + " if (\n", + " seg.start_index == text_anchors_check[\"start_index\"]\n", + " and seg.end_index == text_anchors_check[\"end_index\"]\n", + " ):\n", + " minx_token, miny_token, maxx_token, maxy_token = get_token_xy(\n", + " token\n", + " )\n", + " elif (\n", + " abs(\n", + " int(seg.start_index)\n", + " - int(text_anchors_check[\"start_index\"])\n", + " )\n", + " <= 2\n", + " and abs(\n", + " int(seg.end_index) - int(text_anchors_check[\"end_index\"])\n", + " )\n", + " <= 2\n", + " ):\n", + " minx_token, miny_token, maxx_token, maxy_token = get_token_xy(\n", + " token\n", + " )\n", + "\n", + " return {\n", + " \"min_x\": minx_token,\n", + " \"min_y\": miny_token,\n", + " \"max_x\": maxx_token,\n", + " \"max_y\": maxy_token,\n", + " }\n", + "\n", + "\n", + "def get_entity_new(\n", + " mt_new: str,\n", + " norm_ver: 
List[Dict[str, float]],\n", + " text_seg: List[Dict[str, Any]],\n", + " type_line: str,\n", + " line_item: bool,\n", + ") -> Dict[str, Any]:\n", + " \"\"\"\n", + " Generates a new entity based on the provided parameters.\n", + "\n", + " Args:\n", + " - mt_new (str): Mention text.\n", + " - norm_ver (List[Dict[str, float]]): Normalized vertices.\n", + " - text_seg (List[Dict[str, Any]]): Text segments.\n", + " - type_line (str): Type of the entity.\n", + " - line_item (bool): True if it's a line item entity, False otherwise.\n", + "\n", + " Returns:\n", + " - Dict[str, Any]: The generated entity.\n", + " \"\"\"\n", + "\n", + " if line_item == True:\n", + " line_item_ent = {\n", + " \"confidence\": 1,\n", + " \"mention_text\": mt_new,\n", + " \"page_anchor\": {\n", + " \"page_refs\": [{\"bounding_poly\": {\"normalized_vertices\": norm_ver}}]\n", + " },\n", + " \"properties\": [],\n", + " \"text_anchor\": {\"text_segments\": text_seg},\n", + " \"type_\": type_line,\n", + " }\n", + " return line_item_ent\n", + " else:\n", + " sub_ent = {\n", + " \"confidence\": 1,\n", + " \"mention_text\": mt_new,\n", + " \"page_anchor\": {\n", + " \"page_refs\": [{\"bounding_poly\": {\"normalized_vertices\": norm_ver}}]\n", + " },\n", + " \"text_anchor\": {\"text_segments\": text_seg},\n", + " \"type_\": type_line,\n", + " }\n", + " return sub_ent\n", + "\n", + "\n", + "def tag_ref_child_item(\n", + " json_dict: documentai.Document,\n", + " page: int,\n", + " ent_min_dict: Dict[str, Dict[str, float]],\n", + " consider_ent: str,\n", + " max_stop_y: float,\n", + ") -> List[Dict[str, Any]]:\n", + " \"\"\"\n", + " THIS FUNCTION USED THE LOADED JSON, PAGE NUMBER , DICTIONARY OF HEADER KEYWORD AND VALUES AS\n", + " X AND Y COORDINATES AND THE STOP WORD Y COORDINATE\n", + "\n", + " ARGS:\n", + " - json_dict (Dict[str, Any]): Loaded JSON.\n", + " - page (int): Page number.\n", + " - ent_min_dict (Dict[str, Dict[str, float]]): Dictionary\n", + " of header keyword and values as X and Y coordinates.\n", + " - consider_ent (str): Entity to be tagged.\n", + " - max_stop_y (float): Stop word Y coordinate.\n", + "\n", + " RETURNS:\n", + " - List[Dict[str, Any]]: List of line items tagging the first entity provided.\n", + " \"\"\"\n", + " # parameter entity needed# ***********need to add some condition\n", + " # to check whether amount int similar to other entities need to add\n", + " page_num = 0\n", + " # consider_ent='Amount'\n", + " consider_type = headers_entities[consider_ent]\n", + " line_items_temp = []\n", + " for page in json_dict.pages:\n", + " if int(page_num) == int(page.page_number - 1):\n", + " for token in page.tokens:\n", + " min_x, min_y, max_x, max_y = get_token_xy(token)\n", + " norm_ver11 = [\n", + " {\"x\": min_x, \"y\": min_y},\n", + " {\"x\": min_x, \"y\": max_y},\n", + " {\"x\": max_x, \"y\": min_y},\n", + " {\"x\": max_x, \"y\": max_y},\n", + " ]\n", + " if (\n", + " min_y > ent_min_dict[consider_ent][\"min_y\"]\n", + " and min_x >= ent_min_dict[consider_ent][\"min_x\"] - 0.002\n", + " and max_x <= ent_min_dict[consider_ent][\"max_x\"] + 0.002\n", + " and max_y < max_stop_y\n", + " ):\n", + " for seg in token.layout.text_anchor.text_segments:\n", + " end_index = seg.end_index\n", + " start_index = seg.start_index\n", + " line_item_ent = get_entity_new(\n", + " json_dict.text[int(start_index) : int(end_index)],\n", + " norm_ver11,\n", + " [{\"start_index\": start_index, \"end_index\": end_index}],\n", + " \"line_item\",\n", + " True,\n", + " )\n", + " sub_ent = get_entity_new(\n", + " 
json_dict.text[int(start_index) : int(end_index)],\n", + " norm_ver11,\n", + " [{\"start_index\": start_index, \"end_index\": end_index}],\n", + " consider_type,\n", + " False,\n", + " )\n", + " line_item_ent[\"properties\"].append(sub_ent)\n", + " line_items_temp.append(line_item_ent)\n", + " same_y_ent = []\n", + " for dup in line_items_temp:\n", + " temp_same_y = {\"mention_text\": \"\", \"min_y\": \"\", \"max_y\": \"\", \"text_anc\": []}\n", + " temp_same_y[\"mention_text\"] = dup[\"mention_text\"]\n", + " temp_norm_same_y = dup[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"]\n", + " temp_same_y[\"min_y\"] = min(\n", + " vertex[\"y\"] for vertex in temp_norm_same_y[\"normalized_vertices\"]\n", + " )\n", + " temp_same_y[\"max_y\"] = max(\n", + " vertex[\"y\"] for vertex in temp_norm_same_y[\"normalized_vertices\"]\n", + " )\n", + " temp_same_y[\"text_anc\"] = dup[\"text_anchor\"][\"text_segments\"]\n", + " same_y_ent.append(temp_same_y)\n", + " same_y_ent\n", + " sorted_same_y_ent = sorted(same_y_ent, key=lambda x: x[\"min_y\"])\n", + " groups_same_y = []\n", + " current_group = [sorted_same_y_ent[0]]\n", + "\n", + " for i in range(1, len(sorted_same_y_ent)):\n", + " if sorted_same_y_ent[i][\"min_y\"] - current_group[-1][\"min_y\"] < 0.005:\n", + " current_group.append(sorted_same_y_ent[i])\n", + " else:\n", + " groups_same_y.append(current_group)\n", + " current_group = [sorted_same_y_ent[i]]\n", + "\n", + " # Append the last group\n", + " groups_same_y.append(current_group)\n", + "\n", + " if len(groups_same_y) != 0:\n", + " for group in groups_same_y:\n", + " merge_mention_text = \"\"\n", + " merge_text_anc = []\n", + " merge_page_anc_xy = {\"x\": [], \"y\": []}\n", + " merge_type = \"\"\n", + " for dup1 in group:\n", + " for dup2 in line_items_temp:\n", + " if dup2[\"text_anchor\"][\"text_segments\"] == dup1[\"text_anc\"]:\n", + " merge_mention_text = merge_mention_text + dup2[\"mention_text\"]\n", + " for anch2 in dup2[\"text_anchor\"][\"text_segments\"]:\n", + " merge_text_anc.append(anch2)\n", + " norm_dup = dup2[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"][\n", + " \"normalized_vertices\"\n", + " ]\n", + " for norm_dup_xy in norm_dup:\n", + " merge_page_anc_xy[\"x\"].append(norm_dup_xy[\"x\"])\n", + " merge_page_anc_xy[\"y\"].append(norm_dup_xy[\"y\"])\n", + " line_items_temp.remove(dup2)\n", + " dup_minx, dup_miny, dup_maxx, dup_maxy = (\n", + " min(merge_page_anc_xy[\"x\"]),\n", + " min(merge_page_anc_xy[\"y\"]),\n", + " max(merge_page_anc_xy[\"x\"]),\n", + " max(merge_page_anc_xy[\"y\"]),\n", + " )\n", + " dup_norm_ver = [\n", + " {\"x\": dup_minx, \"y\": dup_miny},\n", + " {\"x\": dup_minx, \"y\": dup_maxy},\n", + " {\"x\": dup_maxx, \"y\": dup_miny},\n", + " {\"x\": dup_maxx, \"y\": dup_maxy},\n", + " ]\n", + " line_item_ent3 = get_entity_new(\n", + " merge_mention_text, dup_norm_ver, merge_text_anc, \"line_item\", True\n", + " )\n", + " sub_ent3 = get_entity_new(\n", + " merge_mention_text, dup_norm_ver, merge_text_anc, consider_type, False\n", + " )\n", + " line_item_ent3[\"properties\"].append(sub_ent3)\n", + " line_items_temp.append(line_item_ent3)\n", + "\n", + " return line_items_temp\n", + "\n", + "\n", + "def tagging_rest_child(\n", + " json_dict: documentai.Document,\n", + " page_num: int,\n", + " line_items_temp: List[Dict[str, Any]],\n", + " headers_entities: Dict[str, Any],\n", + " ent_min_dict: Dict[str, Dict[str, float]],\n", + " consider_ent: str,\n", + ") -> List[Dict[str, Any]]:\n", + " \"\"\"\n", + " THIS FUNCTION USES LOADED JSON, PAGE 
NUMBER AND REFERENCED LINE ITEM LIST\n", + " TAGGED AND HEADER ENTITIES DICTIONARY AND TAGS ALL THE REST OF THE CHILD ITEMS\n", + "\n", + " ARGS:\n", + " - json_dict (Dict[str, Any]): Loaded JSON.\n", + " - page_num (int): Page number.\n", + " - line_items_temp (List[Dict[str, Any]]): Referenced line item list tagged.\n", + " - headers_entities (Dict[str, Any]): Header entities dictionary.\n", + " - ent_min_dict (Dict[str, Dict[str, float]]): Dictionary of header\n", + " keyword and values as X and Y coordinates.\n", + " - consider_ent (str): Entity to be tagged.\n", + "\n", + " RETURNS:\n", + " - List[Dict[str, Any]]: Updated list of line items with tagged child items.\n", + " \"\"\"\n", + " desired_values = [\"line_item/description\", \"description\"]\n", + "\n", + " # Get keys that have the desired value\n", + " matching_keys = [\n", + " key for key, value in headers_entities.items() if value in desired_values\n", + " ]\n", + "\n", + " for line_item in line_items_temp:\n", + " sub_ent_temp = []\n", + " for sub in line_item[\"properties\"]:\n", + " normalized_vertices = sub[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"]\n", + " min_x, min_y = min(\n", + " (vertex[\"x\"], vertex[\"y\"])\n", + " for vertex in normalized_vertices[\"normalized_vertices\"]\n", + " )\n", + " max_x, max_y = max(\n", + " (vertex[\"x\"], vertex[\"y\"])\n", + " for vertex in normalized_vertices[\"normalized_vertices\"]\n", + " )\n", + " for en1, min_xy in ent_min_dict.items():\n", + " temp_mention_text = \"\"\n", + " temp_page_anchor = {\"x\": [], \"y\": []}\n", + " temp_text_anchor = []\n", + " if en1 != consider_ent:\n", + " for page in json_dict.pages:\n", + " if int(page_num) == int(page.page_number - 1):\n", + " for token in page.tokens:\n", + " (\n", + " min_x_token,\n", + " min_y_token,\n", + " max_x_token,\n", + " max_y_token,\n", + " ) = get_token_xy(token)\n", + " if (\n", + " en1 != matching_keys[0]\n", + " and min_xy[\"min_x\"] >= min_x_token - 0.02\n", + " and min_xy[\"max_x\"] <= max_x_token + 0.005\n", + " and abs(min_y - min_y_token) <= 0.005\n", + " ) or (\n", + " en1 == matching_keys[0]\n", + " and min_xy[\"min_x\"] <= min_x_token\n", + " and min_xy[\"max_x\"] >= max_x_token - 0.35\n", + " and abs(min_y - min_y_token) <= 0.005\n", + " ):\n", + " for seg in token.layout.text_anchor.text_segments:\n", + " end_index = seg.end_index\n", + " start_index = seg.start_index\n", + " temp_text_anchor.append(\n", + " {\n", + " \"start_index\": start_index,\n", + " \"end_index\": end_index,\n", + " }\n", + " )\n", + " temp_page_anchor[\"x\"].extend(\n", + " [min_x_token, max_x_token]\n", + " )\n", + " temp_page_anchor[\"y\"].extend(\n", + " [min_y_token, max_y_token]\n", + " )\n", + " temp_mention_text = (\n", + " temp_mention_text\n", + " + json_dict.text[\n", + " int(start_index) : int(end_index)\n", + " ]\n", + " )\n", + " if temp_mention_text != \"\":\n", + " norm_vertices = [\n", + " {\n", + " \"x\": min(temp_page_anchor[\"x\"]),\n", + " \"y\": min(temp_page_anchor[\"y\"]),\n", + " },\n", + " {\n", + " \"x\": min(temp_page_anchor[\"x\"]),\n", + " \"y\": max(temp_page_anchor[\"y\"]),\n", + " },\n", + " {\n", + " \"x\": max(temp_page_anchor[\"x\"]),\n", + " \"y\": min(temp_page_anchor[\"y\"]),\n", + " },\n", + " {\n", + " \"x\": max(temp_page_anchor[\"x\"]),\n", + " \"y\": max(temp_page_anchor[\"y\"]),\n", + " },\n", + " ]\n", + " sub_ent = get_entity_new(\n", + " temp_mention_text,\n", + " norm_vertices,\n", + " temp_text_anchor,\n", + " headers_entities[en1],\n", + " False,\n", + " )\n", + " 
sub_ent_temp.append(sub_ent)\n", + " for item in sub_ent_temp:\n", + " line_item[\"properties\"].append(item)\n", + " line_item_mention_text = \"\"\n", + " line_item_page_anchor = {\"x\": [], \"y\": []}\n", + " line_item_text_anchor = []\n", + " for sub1 in line_item[\"properties\"]:\n", + " line_item_mention_text = line_item_mention_text + sub1[\"mention_text\"]\n", + " for anch1 in sub1[\"text_anchor\"][\"text_segments\"]:\n", + " line_item_text_anchor.append(anch1)\n", + " norm_temp = sub1[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"][\n", + " \"normalized_vertices\"\n", + " ]\n", + " for i in norm_temp:\n", + " line_item_page_anchor[\"x\"].append(i[\"x\"])\n", + " line_item_page_anchor[\"y\"].append(i[\"y\"])\n", + " min_line_x, min_line_y, max_line_x, max_line_y = (\n", + " min(line_item_page_anchor[\"x\"]),\n", + " min(line_item_page_anchor[\"y\"]),\n", + " max(line_item_page_anchor[\"x\"]),\n", + " max(line_item_page_anchor[\"y\"]),\n", + " )\n", + " line_norm_ver = [\n", + " {\"x\": min_line_x, \"y\": min_line_y},\n", + " {\"x\": min_line_x, \"y\": max_line_y},\n", + " {\"x\": max_line_x, \"y\": min_line_y},\n", + " {\"x\": max_line_x, \"y\": max_line_y},\n", + " ]\n", + " line_item[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"][\n", + " \"normalized_vertices\"\n", + " ] = line_norm_ver\n", + " line_item[\"text_anchor\"][\"text_segments\"] = line_item_text_anchor\n", + " line_item[\"mention_text\"] = line_item_mention_text\n", + "\n", + " return line_items_temp\n", + "\n", + "\n", + "def tag_description_bw_regions(\n", + " json_dict: documentai.Document,\n", + " page_num: int,\n", + " line_items_temp: List[Dict[str, Any]],\n", + " max_stop_y: float,\n", + ") -> List[Dict[str, Any]]:\n", + " \"\"\"\n", + " THIS FUNCTION USED LOADED JSON, PAGE AND LINE ITEMS TAGGED AND MAX Y FROM STOP WORD\n", + " AND GIVES THE UPDATED LINE ITEMS TAGGING THE OCR OUTPUT IN\n", + " BETWEEN THE LINE ITEMS AS line_item/description\n", + "\n", + " ARGS:\n", + " - json_dict (Dict[str, Any]): Loaded JSON.\n", + " - page_num (int): Page number.\n", + " - line_items_temp (List[Dict[str, Any]]): Line items tagged.\n", + " - max_stop_y (float): Max Y from stop word.\n", + "\n", + " RETURNS:\n", + " - List[Dict[str, Any]]: Updated line items with tagged descriptions between them.\n", + " \"\"\"\n", + "\n", + " region = []\n", + " region_line_item = []\n", + " for n1 in range(len(line_items_temp)):\n", + " norm_temp_1 = line_items_temp[n1][\"page_anchor\"][\"page_refs\"][0][\n", + " \"bounding_poly\"\n", + " ][\"normalized_vertices\"]\n", + " y_min_1 = min(vertex[\"y\"] for vertex in norm_temp_1)\n", + " y_max_1 = max(vertex[\"y\"] for vertex in norm_temp_1)\n", + " if n1 < len(line_items_temp) - 1:\n", + " norm_temp_2 = line_items_temp[n1 + 1][\"page_anchor\"][\"page_refs\"][0][\n", + " \"bounding_poly\"\n", + " ][\"normalized_vertices\"]\n", + " y_min_2 = min(vertex[\"y\"] for vertex in norm_temp_2)\n", + " y_max_2 = max(vertex[\"y\"] for vertex in norm_temp_2)\n", + " region.append({\"min_y\": y_max_1, \"max_y\": y_min_2})\n", + " region_line_item.append(({\"min_y_1\": y_min_1, \"min_y_2\": y_min_2}))\n", + " else:\n", + " if max_stop_y != 1:\n", + " region.append({\"min_y\": y_max_1, \"max_y\": max_stop_y - 0.01})\n", + " region_line_item.append(({\"min_y_1\": y_min_1, \"min_y_2\": max_stop_y}))\n", + " line_desc_bw_regions = []\n", + " for reg in region:\n", + " temp_text = \"\"\n", + " desc_text_anc = []\n", + " desc_page_anc_xy = {\"x\": [], \"y\": []}\n", + " for page in 
json_dict.pages:\n", + " if int(page_num) == int(page.page_number - 1):\n", + " for token1 in page.tokens:\n", + " (\n", + " min_x_token_1,\n", + " min_y_token_1,\n", + " max_x_token_1,\n", + " max_y_token_1,\n", + " ) = get_token_xy(token1)\n", + " if (\n", + " min_y_token_1 >= reg[\"min_y\"] - 0.005\n", + " and max_y_token_1 <= reg[\"max_y\"] + 0.005\n", + " ):\n", + " for seg in token1.layout.text_anchor.text_segments:\n", + " end_index = seg.end_index\n", + " start_index = seg.start_index\n", + " temp_text = (\n", + " temp_text\n", + " + json_dict.text[int(start_index) : int(end_index)]\n", + " )\n", + " desc_text_anc.append(\n", + " {\"start_index\": start_index, \"end_index\": end_index}\n", + " )\n", + " desc_page_anc_xy[\"x\"].extend([min_x_token_1, max_x_token_1])\n", + " desc_page_anc_xy[\"y\"].extend([min_y_token_1, max_y_token_1])\n", + " if temp_text != \"\":\n", + " norm_vertices_1 = [\n", + " {\"x\": min(desc_page_anc_xy[\"x\"]), \"y\": min(desc_page_anc_xy[\"y\"])},\n", + " {\"x\": min(desc_page_anc_xy[\"x\"]), \"y\": max(desc_page_anc_xy[\"y\"])},\n", + " {\"x\": max(desc_page_anc_xy[\"x\"]), \"y\": min(desc_page_anc_xy[\"y\"])},\n", + " {\"x\": max(desc_page_anc_xy[\"x\"]), \"y\": max(desc_page_anc_xy[\"y\"])},\n", + " ]\n", + " sub_ent_desc = get_entity_new(\n", + " temp_text,\n", + " norm_vertices_1,\n", + " desc_text_anc,\n", + " \"line_item/description\",\n", + " False,\n", + " )\n", + " line_desc_bw_regions.append(sub_ent_desc)\n", + "\n", + " for reg3 in region_line_item:\n", + " for line_5 in line_items_temp:\n", + " norm_temp_line5 = line_5[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"][\n", + " \"normalized_vertices\"\n", + " ]\n", + " y_min_line_5 = min(vertex[\"y\"] for vertex in norm_temp_line5)\n", + " y_max_line_5 = max(vertex[\"y\"] for vertex in norm_temp_line5)\n", + " for line_desc in line_desc_bw_regions:\n", + " norm_temp_desc_2 = line_desc[\"page_anchor\"][\"page_refs\"][0][\n", + " \"bounding_poly\"\n", + " ][\"normalized_vertices\"]\n", + " y_min_desc = min(vertex[\"y\"] for vertex in norm_temp_desc_2)\n", + " y_max_desc = max(vertex[\"y\"] for vertex in norm_temp_desc_2)\n", + " if (\n", + " y_min_desc >= reg3[\"min_y_1\"] - 0.01\n", + " and y_max_desc <= reg3[\"min_y_2\"] + 0.01\n", + " and y_min_line_5 >= reg3[\"min_y_1\"] - 0.01\n", + " and y_max_line_5 <= reg3[\"min_y_2\"] + 0.01\n", + " ):\n", + " # print(line_desc['mention_text'])\n", + " line_5[\"properties\"].append(line_desc)\n", + "\n", + " line_desc_bw_regions.remove(line_desc)\n", + "\n", + " for line_fin in line_items_temp:\n", + " temp_text_2 = \"\"\n", + " temp_text_anc_2 = []\n", + " temp_page_anc_xy_2 = {\"x\": [], \"y\": []}\n", + " for subline in line_fin[\"properties\"]:\n", + " for an5 in subline[\"text_anchor\"][\"text_segments\"]:\n", + " temp_text_anc_2.append(an5)\n", + " for xy2 in subline[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"][\n", + " \"normalized_vertices\"\n", + " ]:\n", + " temp_page_anc_xy_2[\"x\"].append(xy2[\"x\"])\n", + " temp_page_anc_xy_2[\"y\"].append(xy2[\"y\"])\n", + " # print(temp_text_anc_2)\n", + " sorted_temp_text_anc_2 = sorted(\n", + " temp_text_anc_2, key=lambda x: int(x[\"end_index\"])\n", + " )\n", + " temp_done_anc = []\n", + " for index_4 in sorted_temp_text_anc_2:\n", + " if index_4 not in temp_done_anc:\n", + " temp_text_2 = (\n", + " temp_text_2\n", + " + json_dict.text[\n", + " int(index_4[\"start_index\"]) : int(index_4[\"end_index\"])\n", + " ]\n", + " )\n", + " temp_done_anc.append(index_4)\n", + " min_x_line_fin, 
min_y_line_fin, max_x_line_fin, max_y_line_fin = (\n", + " min(temp_page_anc_xy_2[\"x\"]),\n", + " min(temp_page_anc_xy_2[\"y\"]),\n", + " max(temp_page_anc_xy_2[\"x\"]),\n", + " max(temp_page_anc_xy_2[\"y\"]),\n", + " )\n", + " line_fin[\"page_anchor\"][\"page_refs\"][0][\"bounding_poly\"][\n", + " \"normalized_vertices\"\n", + " ] = [\n", + " {\"x\": min_x_line_fin, \"y\": min_y_line_fin},\n", + " {\"x\": max_x_line_fin, \"y\": min_y_line_fin},\n", + " {\"x\": min_x_line_fin, \"y\": max_y_line_fin},\n", + " {\"x\": max_x_line_fin, \"y\": max_y_line_fin},\n", + " ]\n", + " line_fin[\"text_anchor\"][\"text_segments\"] = sorted_temp_text_anc_2\n", + " line_fin[\"mention_text\"] = temp_text_2\n", + "\n", + " # pprint(line_fin)\n", + "\n", + " return line_items_temp\n", + "\n", + "\n", + "file_names_list, file_dict = file_names(gcs_input_path)\n", + "bucket_name = gcs_input_path.split(\"/\")[2]\n", + "for filename, filepath in tqdm(file_dict.items(), desc=\"Progress\"):\n", + " input_bucket_name = gcs_input_path.split(\"/\")[2]\n", + " if \".json\" in filepath:\n", + " json_dict = documentai_json_proto_downloader(bucket_name, filepath)\n", + " page_wise_ent = get_page_wise_entities(json_dict)\n", + " line_item_entities = []\n", + " for page, ent in page_wise_ent.items():\n", + " ent_min_dict = get_text_anc_headers(\n", + " json_dict, page, headers, headers_entities=headers_entities\n", + " )\n", + " try:\n", + " y_max_stop = get_text_anc_headers(json_dict, page, stop_word)\n", + " for stop, ver in y_max_stop.items():\n", + " max_stop_y = ver[\"max_y\"]\n", + " except:\n", + " max_stop_y = 1\n", + " line_items_temp = tag_ref_child_item(\n", + " json_dict, page, ent_min_dict, consider_ent, max_stop_y\n", + " )\n", + " line_items_temp_1 = tagging_rest_child(\n", + " json_dict,\n", + " page,\n", + " line_items_temp,\n", + " headers_entities,\n", + " ent_min_dict,\n", + " consider_ent,\n", + " )\n", + " line_items_temp_page = tag_description_bw_regions(\n", + " json_dict, page, line_items_temp_1, max_stop_y\n", + " )\n", + " for line_temp_ent5 in line_items_temp_page:\n", + " line_item_entities.append(line_temp_ent5)\n", + "\n", + " if line_item_entities != []:\n", + " final_entities = []\n", + " for entity in json_dict.entities:\n", + " if entity.type != \"line_item\":\n", + " final_entities.append(entity)\n", + " for line_ent in line_item_entities:\n", + " final_entities.append(line_ent)\n", + " json_dict.entities = final_entities\n", + " else:\n", + " print(\"No change in the file\")\n", + " store_document_as_json(\n", + " documentai.Document.to_json(json_dict),\n", + " gcs_output_path.split(\"/\")[2],\n", + " (\"/\").join(gcs_output_path.split(\"/\")[3:]) + \"/\" + filename,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "835e86d6-a433-4803-b9a8-87aa9adada19", + "metadata": {}, + "source": [ + "## OUTPUT\n", + "\n", + "* Before and after the postprocessing code\n", + "\n", + "* Before post processing code\n", + "\n", + "\n", + " \n", + "* After using Post processing code\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf09348d-0c84-4e40-9269-d69d74b8d0f6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "conda-root-py", + "name": "workbench-notebooks.m113", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel) (Local)", + "language": "python", + "name": "conda-root-py" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/synonyms_based_splitter_document_labeling/Images/cde_entity.png b/incubator-tools/synonyms_based_splitter_document_labeling/Images/cde_entity.png new file mode 100644 index 000000000..6f804fcc9 Binary files /dev/null and b/incubator-tools/synonyms_based_splitter_document_labeling/Images/cde_entity.png differ diff --git a/incubator-tools/synonyms_based_splitter_document_labeling/Images/folders.png b/incubator-tools/synonyms_based_splitter_document_labeling/Images/folders.png new file mode 100644 index 000000000..a488e5f4a Binary files /dev/null and b/incubator-tools/synonyms_based_splitter_document_labeling/Images/folders.png differ diff --git a/incubator-tools/synonyms_based_splitter_document_labeling/Images/pdf_split.png b/incubator-tools/synonyms_based_splitter_document_labeling/Images/pdf_split.png new file mode 100644 index 000000000..3cb0b3e7c Binary files /dev/null and b/incubator-tools/synonyms_based_splitter_document_labeling/Images/pdf_split.png differ diff --git a/incubator-tools/synonyms_based_splitter_document_labeling/Images/splitter_entity.png b/incubator-tools/synonyms_based_splitter_document_labeling/Images/splitter_entity.png new file mode 100644 index 000000000..c42611f3d Binary files /dev/null and b/incubator-tools/synonyms_based_splitter_document_labeling/Images/splitter_entity.png differ diff --git a/incubator-tools/synonyms_based_splitter_document_labeling/README.md b/incubator-tools/synonyms_based_splitter_document_labeling/README.md new file mode 100644 index 000000000..171d903a2 --- /dev/null +++ b/incubator-tools/synonyms_based_splitter_document_labeling/README.md @@ -0,0 +1,45 @@ +# Purpose and Description + +The document will guide to label the documents which can be used by custom document splitter parser using synonyms list and OCR parsed jsons. +There is also an optional flag to split the pdfs and save into the GCS folders(named as labels). 
+ +Note: +If a synonym is not unique or appears on more than one page, the synonym that is mentioned first in the list will +be considered, and the entity will be added with the type that has the lowest index in the list. + +EXAMPLE: synonyms_list=['PART A','PART B','PART C','PART D'] + +If both PART A and PART B are found on any of the pages, the type of that page is considered PART A, as +it has the lowest index in the list (the first element is given priority by the code). + +## Input Details +* **project_id** : Your Google project ID or name +* **synonyms_list** : List of synonyms to search for in the OCR text when splitting documents +* **gcs_input_uri** : GCS path of the OCR-parsed JSON results +* **gcs_output_uri** : Path to save the updated JSONs +* **save_split_pdfs_flag** : Flag indicating whether to save the split PDFs in a GCS bucket +* **pdfs_ouput_path** : Path to save the split files +* **synonym_entity_name** : Entity type to view in CDE +* **label_unidentified_entity_name** : Default label name used when no synonym is found in the first few pages + +## Output Details + +Entities will be added to the JSONs and saved in the output GCS path. + +* CDE-format entities with entity type **synonym_entity**, as shown below + +cde_entity + +* Splitter-format entities added with an entity type matching the given labels or synonyms + +splitter_entity + +* If save_split_pdfs_flag is TRUE, the split PDFs will be saved in the given GCS path, with folder names matching the labels + +folders_image + +If a document does not contain any of the given synonyms, it will be saved in the label_unidentified folder. + +The file names will be **filename+timestamp.pdf** + +pdf_split diff --git a/incubator-tools/synonyms_based_splitter_document_labeling/synonyms_based_splitter_document_labeling.ipynb b/incubator-tools/synonyms_based_splitter_document_labeling/synonyms_based_splitter_document_labeling.ipynb new file mode 100644 index 000000000..fedb7d91b --- /dev/null +++ b/incubator-tools/synonyms_based_splitter_document_labeling/synonyms_based_splitter_document_labeling.ipynb @@ -0,0 +1,711 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "55646894-baf1-4c1c-9790-5fc16468a282", + "metadata": {}, + "source": [ + "# Synonyms Based Splitter Document Labeling " + ] + }, + { + "cell_type": "markdown", + "id": "9c1e5f0d-91ea-4766-8ab3-f66429e66e1b", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "528c5a36-a860-468f-a30a-08d051880aee", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "7374f665-15de-4292-b045-cf6acc14fa10", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This notebook automates document labeling using a synonyms-based approach for a Custom Document Splitter Parser. By comparing a user-defined list of keywords against OCR-extracted text, it identifies and labels document segments, enhancing document organization and categorization. There is also an optional flag to split the PDFs and save them into GCS folders (named after the labels).\n", + "\n", + "In this context, \"synonyms\" refer to a set of keywords that match text extracted from documents. 
By using these keywords, the tool searches the OCR text to create splitter entities, which are markers used to categorize and split the document based on identified keywords.\n", + "\n", + "### Practical Application\n", + "\n", + "The tool labels document sections by searching OCR text for user-provided synonyms, streamlining the process of splitting and categorizing documents based on their content.\n", + "\n", + "### Examples\n", + "\n", + "- **Example 1:** With `synonyms_list=['PART A','PART B','PART C','PART D']`, if both \"PART A\" and \"PART B\" are found, the page is labeled as \"PART A\".\n", + "- **Example 2:** For `synonyms_list=['INTRODUCTION', 'EXECUTIVE SUMMARY', 'CONCLUSION']`, if \"EXECUTIVE SUMMARY\" and \"CONCLUSION\" are found, it labels the page as \"EXECUTIVE SUMMARY\".\n", + "\n", + "Priority is given to the synonym appearing first in the list when multiple matches occur on a page, ensuring consistent labeling." + ] + }, + { + "cell_type": "markdown", + "id": "3b5734e7-8944-485a-9412-ae063ef57318", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "* Python : Jupyter notebook (Vertex AI) \n", + "* Service account permissions in projects.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "ce627c16-65b6-4870-b0b7-28d5fe599905", + "metadata": {}, + "source": [ + "## Step by Step procedure \n", + "\n", + "### 1.Importing Required Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d1964d8-b1e4-42b7-a399-40861581c371", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f1928e8-b562-443d-b17d-88f42cebda45", + "metadata": {}, + "outputs": [], + "source": [ + "#importing libraries\n", + "import re\n", + "from utilities import *\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "import json\n", + "from pathlib import Path\n", + "from typing import Any, Dict, List, Optional, Sequence, Tuple, Union\n", + "from google.cloud import storage\n", + "from google.cloud.exceptions import Conflict, NotFound\n", + "from PIL import Image\n", + "import io" + ] + }, + { + "cell_type": "markdown", + "id": "9fdcdd55-89b2-4b67-838e-d82fc5c2d957", + "metadata": {}, + "source": [ + "### 2.Setup the required inputs\n", + "* `project_id` : Your Google project id or name\n", + "* `synonyms_list` : list of synonyms which has to be used to search in ocr for splitting documents\n", + "* `gcs_input_uri` : OCR PARSED JSONS RESULTS PATH\n", + "* `gcs_output_uri` : Path to save the updated jsons\n", + "* `save_split_pdfs_flag` : flag whether to save the splitted pdfs in gcs bucket\n", + "* `pdfs_ouput_path` : path to save the split files\n", + "* `synonym_entity_name` : type of entity to view in cde \n", + "* `label_unidentified_entity_name` : default label name in case first few pages no synonym found\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d717d6b4-4bdd-4d04-b9e4-da57f4dd008f", + "metadata": {}, + "outputs": [], + "source": [ + "project_id='xxxx-xxxx-xxxx' \n", + "synonyms_list=['PART A','PART B','PART C','PART D'] \n", + "gcs_input_uri=\"gs://xxxx/xxxx/xxxx/\"\n", + "gcs_output_uri='gs://xxxx/xxxx/xxx/' \n", + "save_split_pdfs_flag='TRUE'\n", + "pdfs_ouput_path='gs://xxxx/xxxx/xxx/' \n", + "synonym_entity_name='synonym_entity' \n", + 
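# Note (based on the checks later in this notebook): save_split_pdfs_flag is compared
# against the literal string 'TRUE' (if save_split_pdfs_flag=='TRUE'), so keep it as a
# string rather than a Python boolean.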
"label_unidentified_entity_name=\"label_unidentified\"" + ] + }, + { + "cell_type": "markdown", + "id": "cd83c5f9-8bf3-460e-ae3d-177ce642c415", + "metadata": {}, + "source": [ + "### Function to parse the Raw pdfs upto 200 pages in a single json\n", + "Use the below function to parse the documents which has more than 10 pages to get the output in a single json and provide the path in the `gcs_input_uri` as given in above cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d957fca3-7d5e-4a24-bd39-14750ac80130", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# BATCH PROCESSING FUNCTION WITH SHARDING TILL 200 pages\n", + "def batch_process_documents(\n", + " project_id : str,\n", + " location : str,\n", + " processor_id : str,\n", + " gcs_input_uri : str,\n", + " gcs_output_uri : str,\n", + " timeout: int = 600,\n", + ") -> Any:\n", + " \"\"\"It will perform Batch Process on raw input documents\n", + "\n", + " Args:\n", + " project_id (str): GCP project ID\n", + " location (str): Processor location us or eu\n", + " processor_id (str): GCP DocumentAI ProcessorID\n", + " gcs_input_uri (str): GCS path which contains all input files\n", + " gcs_output_uri (str): GCS path to store processed JSON results\n", + " timeout (int, optional): Maximum waiting time for operation to complete.\n", + "\n", + " Returns:\n", + " operation.Operation: LRO operation ID for current batch-job\n", + " \"\"\"\n", + "\n", + " # You must set the api_endpoint if you use a location other than 'us', e.g.:\n", + " opts = {}\n", + " if location == \"eu\":\n", + " opts = {\"api_endpoint\": \"eu-documentai.googleapis.com\"}\n", + " elif location == \"us\":\n", + " opts = {\"api_endpoint\": \"us-documentai.googleapis.com\"}\n", + " #opts = {\"api_endpoint\": \"us-autopush-documentai.sandbox.googleapis.com\"}\n", + " client = documentai.DocumentProcessorServiceClient(client_options=opts)\n", + "\n", + "\n", + " input_config= documentai.BatchDocumentsInputConfig(gcs_prefix=documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri))\n", + " \n", + " sharding_config = documentai.DocumentOutputConfig.GcsOutputConfig.ShardingConfig(pages_per_shard=200)\n", + " gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(\n", + " gcs_uri=gcs_output_uri, sharding_config=sharding_config\n", + " )\n", + "\n", + " output_config = documentai.DocumentOutputConfig(\n", + " gcs_output_config=gcs_output_config\n", + " )\n", + "\n", + " # Location can be 'us' or 'eu'\n", + " name = f\"projects/{project_id}/locations/{location}/processors/{processor_id}\"\n", + " request = documentai.types.document_processor_service.BatchProcessRequest(\n", + " name=name,\n", + " input_documents=input_config,\n", + " document_output_config=output_config,\n", + " )\n", + "\n", + " operation = client.batch_process_documents(request)\n", + "\n", + " # Wait for the operation to finish\n", + " operation.result(timeout=timeout)\n", + " return operation" + ] + }, + { + "cell_type": "markdown", + "id": "10d320b8-c2c8-46b9-8c10-33c5147347a6", + "metadata": {}, + "source": [ + "### 3.Importing Required functions and calling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92e49ea6-f84c-47e5-a562-1632f8e6e2a5", + "metadata": {}, + "outputs": [], + "source": [ + "def get_text_anchors_page_wise(json_ocr : object) -> Dict:\n", + " \"\"\"\n", + " Get text anchors for each page in the OCR result.\n", + "\n", + " Args:\n", + " json_ocr (object): The OCR result in Document AI Document format.\n", + "\n", + " 
Returns:\n", + " Dict : A dictionary where keys are page numbers (0-indexed),\n", + " and values are dictionaries with 'start_index' and 'end_index' for each page's text anchor.\n", + " \"\"\"\n", + " \n", + " #Getting text anchors\n", + " p=0\n", + " text_anchors_page_wise={}\n", + " for page in json_ocr.pages:\n", + " for an in page.layout.text_anchor.text_segments:\n", + " start_index=an.start_index\n", + " end_index=an.end_index\n", + " text_anchors_page_wise[p]={'start_index':start_index,'end_index':end_index}\n", + " p+=1\n", + " return text_anchors_page_wise\n", + "\n", + "#getting text anchors of matches with synonyms\n", + "def find_substring_indexes(text : str, substring : str) -> List[Union[int, int]]:\n", + " \"\"\"\n", + " Find the starting and ending indexes of occurrences of a substring in the given text.\n", + "\n", + " Args:\n", + " text (str): The input text where substring needs to be found.\n", + " substring (str): The substring to be searched in the text.\n", + "\n", + " Returns:\n", + " List[Union[int, int]]: A list of tuples containing the starting and ending indexes of substring occurrences.\n", + " \"\"\"\n", + " \n", + " if ' ' or '\\n' not in substring:\n", + " pattern = re.compile(re.escape(substring), re.IGNORECASE)\n", + " matches = [(match.start(), match.end()) for match in pattern.finditer(text)]\n", + " else:\n", + " pattern = re.compile(r'{}.*{}'.format(re.escape(substring.split(' ')[0]),re.escape(substring.split(' ')[-1])), re.IGNORECASE)\n", + " matches = [(match.start(), match.end()) for match in pattern.finditer(json_dict['text'])]\n", + "\n", + " return matches\n", + "\n", + "def get_synonyms_matches_pages(synonyms_list : List[str], text_anchors_page_wise : Dict[int, Dict[str, int]], json_ocr : object) -> Tuple:\n", + " \"\"\"\n", + " Find matches of synonyms in the OCR text and associate them with corresponding pages.\n", + "\n", + " Args:\n", + " synonyms_list (List[str]): List of synonyms to be searched in the OCR text.\n", + " text_anchors_page_wise (Dict[int, Dict[str, int]]): Text anchors with start and end indexes for each page.\n", + " json_ocr (object): JSON representation of the OCR output.\n", + "\n", + " Returns:\n", + " Tuple : A tuple containing:\n", + " - A dictionary with synonyms as keys and lists of pages where they are found, sorted in ascending order.\n", + " - A dictionary with synonym information, including text anchors and corresponding pages.\n", + " \"\"\"\n", + " \n", + " matches_synonyms={}\n", + " synonym_info={}\n", + " for synonym in synonyms_list:\n", + " pattern = re.compile('[^a-zA-Z0-9\\s]')\n", + " matches_list=find_substring_indexes(re.sub(pattern, ' ', json_ocr.text),re.sub(pattern, ' ', synonym))#find_substring_indexes(json_ocr.text, synonym)\n", + " # print(matches_list)\n", + " for match in matches_list:\n", + " for p1,anc in text_anchors_page_wise.items():\n", + " if match[0]>=anc['start_index'] and match[1]<=anc['end_index']:\n", + " if synonym in matches_synonyms.keys():\n", + " matches_synonyms[synonym].append(p1)\n", + " synonym_info[synonym].append({'text_anchors':{'start_index':match[0],'end_index':match[1]},'page':p1})\n", + " else:\n", + " matches_synonyms[synonym]=[p1]\n", + " synonym_info[synonym]=[{'text_anchors':{'start_index':match[0],'end_index':match[1]},'page':p1}]\n", + " matches_synonyms_updated = {key: sorted(list(set(value))) for key, value in matches_synonyms.items()}\n", + " synonym_wise_data={}\n", + " temp_pages=[]\n", + " temp_permanant=[]\n", + " temp_page=-1\n", + " 
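In `find_substring_indexes` above, the condition `if ' ' or '\n' not in substring:` is always truthy, so the multi-word branch is effectively unreachable (and it refers to `json_dict`, which is not defined in that scope). If the intent is a loose first-word…last-word match for multi-word synonyms, a corrected sketch of that check could look like the following; this is an assumption about the intended behavior, not the original logic.

```python
import re


def find_substring_indexes_sketch(text, substring):
    """Presumed intent: exact (escaped) match for single-word synonyms,
    loose 'first word ... last word' match for multi-word synonyms."""
    if " " not in substring and "\n" not in substring:
        pattern = re.compile(re.escape(substring), re.IGNORECASE)
    else:
        first, last = substring.split()[0], substring.split()[-1]
        pattern = re.compile(re.escape(first) + r".*" + re.escape(last), re.IGNORECASE)
    return [(m.start(), m.end()) for m in pattern.finditer(text)]


# 'PART A' still matches when OCR inserts extra spaces between the words.
print(find_substring_indexes_sketch("... PART   A ...", "PART A"))
```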
asssigned_pages=list(set(value for values_list in matches_synonyms_updated.values() for value in values_list))\n", + " unassigned_pages=[]\n", + " synonym_assigned=''\n", + " for page_num in range(len(json_ocr.pages)):\n", + " asssigned_flag='NO'\n", + " for synonym_1,pages_available in matches_synonyms_updated.items():\n", + " if synonym_assigned=='':\n", + " synonym_assigned=synonym_1\n", + " if page_num in pages_available:\n", + " if temp_page0:\n", + " synonym_wise_data[synonym_1].append(temp_pages)\n", + " temp_permanant.append(temp_pages)\n", + " elif len(temp_pages)>0:\n", + " synonym_wise_data[synonym_1]=[temp_pages]\n", + " temp_pages=[]\n", + " asssigned_flag='YES'\n", + " if asssigned_flag=='NO':\n", + " unassigned_pages.append(page_num)\n", + "\n", + " for unass_page in unassigned_pages:\n", + " closest_list=''\n", + " closest_synonym=''\n", + " min_diff=100\n", + " for syn_2,pagass_list in synonym_wise_data.items():\n", + " for pag_ass in pagass_list:\n", + " for p_n1 in pag_ass:\n", + " if p_n1unass_page-p_n1:\n", + " min_diff=unass_page-p_n1\n", + " closest_list=pag_ass\n", + " closest_synonym=syn_2\n", + " else:\n", + " continue\n", + " if closest_synonym!='':\n", + " #print(closest_list)\n", + " for syn_2,pagass_list in synonym_wise_data.items():\n", + " for pag_ass in pagass_list:\n", + " if syn_2==closest_synonym and pag_ass==closest_list:\n", + " pag_ass.append(unass_page)\n", + " else:\n", + " if label_unidentified_entity_name in synonym_wise_data.keys():\n", + " synonym_wise_data[label_unidentified_entity_name].append([unass_page])\n", + " else:\n", + " synonym_wise_data[label_unidentified_entity_name]=[[unass_page]]\n", + "\n", + "\n", + " data = {part: [list(set(sublist)) for sublist in lists] for part, lists in synonym_wise_data.items()}\n", + " for part, part_data in data.items():\n", + " for sublist in part_data:\n", + " sublist.sort()\n", + " return data,synonym_info\n", + "\n", + "def remove_repeated_pages(data : Dict[str, List[List[int]]]) -> Dict[str, List[List[int]]]:\n", + " \"\"\"\n", + " Remove repeated page numbers from the provided data.\n", + "\n", + " Args:\n", + " data (Dict[str, List[List[int]]]): Dictionary containing part-wise data with lists of page numbers.\n", + "\n", + " Returns:\n", + " Dict[str, List[List[int]]]: Modified dictionary with repeated page numbers removed from the lists.\n", + " \"\"\"\n", + " \n", + " all_numbers = []\n", + " unique_numbers = set()\n", + " repeated_numbers = set()\n", + "\n", + " for part_data in data.values():\n", + " for item in part_data:\n", + " if isinstance(item, list):\n", + " flat_list = item\n", + " else:\n", + " flat_list = [item]\n", + " all_numbers.extend(flat_list)\n", + "\n", + " for num in all_numbers:\n", + " if num in unique_numbers:\n", + " repeated_numbers.add(num)\n", + " else:\n", + " unique_numbers.add(num)\n", + " \n", + " for part, part_data in data.items():\n", + " for sublist in part_data:\n", + " if isinstance(sublist, list) and len(sublist) > 1 and sublist[0] in repeated_numbers:\n", + " sublist.pop(0)\n", + " return data\n", + "\n", + "def store_blob(bytes_stream: bytes, file: str ,BUCKET_NAME: str) -> None:\n", + " \"\"\"To store PDF files in GCS\n", + "\n", + " Args:\n", + " bytes_stream (bytes): Binary Format of pdf data\n", + " file (str): filename to store in specified GCS bucket\n", + " \"\"\"\n", + "\n", + " storage_client = storage.Client()\n", + " result_bucket = storage_client.get_bucket(BUCKET_NAME)\n", + " document_blob = storage.Blob(name=str(file), 
bucket=result_bucket)\n", + " document_blob.upload_from_string(bytes_stream, content_type=\"application/pdf\")\n", + "\n", + "def save_split_pdfs(json_ocr : object, pdfs_ouput_path : str, file_name : str, synonym_tag : Optional[str]) -> None:\n", + " \"\"\"\n", + " Save split PDFs based on OCR data.\n", + "\n", + " Args:\n", + " json_ocr (object): JSON OCR data.\n", + " pdfs_ouput_path (str): Output path for storing the PDFs.\n", + " file_name (str): Base file name for the saved PDFs.\n", + " synonym_tag (Optional[str]): 'YES' if synonym tagging, 'NO' otherwise.\n", + "\n", + " Returns:\n", + " None\n", + " \"\"\"\n", + " \n", + " if synonym_tag=='YES':\n", + " for entity in json_ocr.entities:\n", + " if entity.type!=synonym_entity_name:\n", + " pages_new=[]\n", + " for p_num in entity.page_anchor.page_refs:\n", + " # print(p_num)\n", + " pages_new.append(p_num.page)\n", + " pages_image=[]\n", + " for page_num1 in range(len(json_ocr.pages)):\n", + " if page_num1 in pages_new:\n", + " pages_image.append(json_ocr.pages[page_num1].image.content)\n", + " folder_name=entity.type\n", + " synthesized_images = [decode_image(page) for page in pages_image]\n", + " pdf_bytes = create_pdf_from_images(synthesized_images)\n", + " from datetime import datetime\n", + " current_time = datetime.now()\n", + " time_stamp_1=int(current_time.timestamp())\n", + " file_save_path=('/').join(pdfs_ouput_path.split('/')[3:])+str(folder_name)+'/'+file_name+'_'+str(time_stamp_1)+'.pdf'\n", + " BUCKET_NAME=pdfs_ouput_path.split('/')[2]\n", + " store_blob(bytes_stream= pdf_bytes, file=file_save_path ,BUCKET_NAME=BUCKET_NAME)\n", + "\n", + " elif synonym_tag=='NO':\n", + " synthesized_images = [decode_image(page.image.content) for page in json_ocr.pages]\n", + " pdf_bytes = create_pdf_from_images(synthesized_images)\n", + " from datetime import datetime\n", + " current_time = datetime.now()\n", + " time_stamp_1=int(current_time.timestamp())\n", + " file_save_path=('/').join(pdfs_ouput_path.split('/')[3:])+label_unidentified_entity_name+'/'+file_name+'_'+str(time_stamp_1)+'.pdf'\n", + " BUCKET_NAME=pdfs_ouput_path.split('/')[2]\n", + " store_blob(bytes_stream= pdf_bytes, file=file_save_path ,BUCKET_NAME=BUCKET_NAME)\n", + "\n", + "def decode_image(image_bytes: bytes) -> Image.Image:\n", + " \"\"\"\n", + " Decode image bytes into a Pillow Image object.\n", + "\n", + " Args:\n", + " image_bytes (bytes): The image bytes to be decoded.\n", + "\n", + " Returns:\n", + " Image.Image: The Pillow Image object.\n", + " \"\"\"\n", + " \n", + " with io.BytesIO(image_bytes) as image_file:\n", + " image = Image.open(image_file)\n", + " image.load()\n", + " return image\n", + "\n", + "def create_pdf_from_images(images: Sequence[Image.Image]) -> bytes:\n", + " \"\"\"Creates a PDF from a sequence of images.\n", + "\n", + " The PDF will contain 1 page per image, in the same order.\n", + "\n", + " Args:\n", + " images: A sequence of images.\n", + "\n", + " Returns:\n", + " The PDF bytes.\n", + " \"\"\"\n", + " \n", + " if not images:\n", + " raise ValueError(\"At least one image is required to create a PDF\")\n", + "\n", + " # PIL PDF saver does not support RGBA images\n", + " images = [\n", + " image.convert(\"RGB\") if image.mode == \"RGBA\" else image for image in images\n", + " ]\n", + "\n", + " with io.BytesIO() as pdf_file:\n", + " images[0].save(\n", + " pdf_file, save_all=True, append_images=images[1:], format=\"PDF\"\n", + " )\n", + " return pdf_file.getvalue()\n", + " \n", + "def create_splitter_entities(json_ocr : object, 
synonyms_list : List[str], synonym_entity_name : str) -> Tuple[object, str]:\n", + " \"\"\"\n", + " Creates splitter entities based on the identified synonyms in the OCR output.\n", + "\n", + " Args:\n", + " json_ocr (object): The OCR output in the form of a Document AI document.\n", + " synonyms_list (List[str]): List of synonyms to be identified in the OCR output.\n", + " synonym_entity_name (str): Name to be assigned to the entity representing identified synonyms.\n", + "\n", + " Returns:\n", + " Tuple[object, str]: A tuple containing the updated OCR document and a tag indicating\n", + " whether synonyms were found ('YES') or not ('NO').\n", + " \"\"\"\n", + " \n", + " text_anchors_page_wise=get_text_anchors_page_wise(json_ocr)\n", + " synonyms_pages,synonym_info=get_synonyms_matches_pages(synonyms_list,text_anchors_page_wise,json_ocr)\n", + " if len(synonyms_pages)>0:\n", + " data=remove_repeated_pages(synonyms_pages)\n", + " max_page=len(text_anchors_page_wise)\n", + " max_page_list = max([max(sublist, default=0) for sublist in sum(data.values(), [])])\n", + " entities_splitter=[]\n", + " for synonym,pages_nested_list in data.items():\n", + " for pages_list in pages_nested_list:\n", + " temp_splitter_entity={'type_':'','text_anchor':{'text_segments':[]},'page_anchor':{'page_refs':[]}}\n", + " # if max_page_list in pages_list:\n", + " # pages_list.extend(range(max_page_list, max_page))\n", + " # # print(pages_list)\n", + " if len(pages_list)>=1:\n", + " sorted_pages=sorted(pages_list)\n", + " start_index_ent=text_anchors_page_wise[sorted_pages[0]]['start_index']\n", + " end_index_ent=text_anchors_page_wise[sorted_pages[-1]]['end_index']\n", + " for page_2 in pages_list:\n", + " temp_splitter_entity['page_anchor']['page_refs'].append({'page':page_2})\n", + " temp_splitter_entity['text_anchor']['text_segments'].append({'start_index':start_index_ent,'end_index':end_index_ent})\n", + " temp_splitter_entity['type_']=''.join(['_' if c.isspace() or not c.isalnum() else c for c in synonym])\n", + " entities_splitter.append(temp_splitter_entity)\n", + " \n", + " json_ocr.entities=entities_splitter\n", + " json_ocr=add_ent_json(json_ocr,synonym_info,synonym_entity_name)\n", + " synonym_tag='YES'\n", + " else:\n", + " # print('NO OUTPUT')\n", + " synonym_tag='NO'\n", + " \n", + " return json_ocr,synonym_tag\n", + "\n", + "def get_new_entity(syn_data_1 : Dict[str, Any], json_ocr : object, synonym_entity_name : str) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Creates a new entity based on the provided synonym data and the OCR output.\n", + "\n", + " Args:\n", + " syn_data_1 (Dict[str, Any]): Information about the synonym data, including text anchors and page number.\n", + " json_ocr (object): The OCR output in the form of a Document AI document.\n", + " synonym_entity_name (str): Name to be assigned to the new entity.\n", + "\n", + " Returns:\n", + " Dict[str, Any]: A dictionary representing the new entity with mention text, page anchors, text anchors, and type.\n", + " \"\"\"\n", + " \n", + " text_anchors_temp= syn_data_1['text_anchors']\n", + " page_num=syn_data_1['page']\n", + " # synonym_entity_name='synonym_entity'\n", + " new_ent={'mention_text':'','page_anchor':{'page_refs':[{'bounding_poly':{'normalized_vertices':[]},'page': page_num}]},'text_anchor':{'text_segments': []},'type_':''}\n", + " entity_text_anc=[]\n", + " page_anc={'x':[],'y':[]}\n", + " for page in json_ocr.pages:\n", + " # print(page.page_number)\n", + " if page_num==page.page_number-1:\n", + " # print(page.page_number)\n", + " 
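# The loop below collects every token whose text segment falls inside the matched
# synonym span (with a couple of characters of tolerance), appends its text anchors,
# and accumulates min/max normalized vertices to build the entity's bounding box.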
for token in page.tokens:\n", + " # print(token)\n", + " token_seg=token.layout.text_anchor.text_segments\n", + " for seg in token_seg:\n", + " token_start=seg.start_index\n", + " token_end=seg.end_index\n", + " if token_start>=text_anchors_temp['start_index']-3 and token_end<=text_anchors_temp['end_index']+2:\n", + " if json_ocr.text[token_start:token_end].replace(\" \", \"\") in json_ocr.text[text_anchors_temp['start_index']:text_anchors_temp['end_index']].replace(\" \", \"\"):\n", + " vertices = token.layout.bounding_poly.normalized_vertices\n", + " minx_token, miny_token = min(point.x for point in vertices), min(point.y for point in vertices)\n", + " maxx_token, maxy_token = max(point.x for point in vertices), max(point.y for point in vertices)\n", + " entity_text_anc.append({'start_index':token_start,'end_index':token_end})\n", + " page_anc['x'].extend([minx_token,maxx_token])\n", + " page_anc['y'].extend([miny_token,maxy_token])\n", + " new_ent['mention_text']=json_ocr.text[text_anchors_temp['start_index']:text_anchors_temp['end_index']]\n", + " page_anchors_ent=[{'x':min(page_anc['x']),'y':min(page_anc['y'])},{'x':min(page_anc['x']),'y':max(page_anc['y'])},\n", + " {'x':max(page_anc['x']),'y':min(page_anc['y'])},{'x':max(page_anc['x']),'y':max(page_anc['y'])}]\n", + " new_ent['page_anchor']['page_refs'][0]['bounding_poly']['normalized_vertices']=page_anchors_ent\n", + " new_ent['text_anchor']['text_segments']=entity_text_anc\n", + " new_ent['type_']=synonym_entity_name\n", + " \n", + " return new_ent\n", + "\n", + "def create_cde_entities(synonym_info : Dict[str, List[Dict[str, Any]]], json_ocr : object, synonym_entity_name : str) ->List[Dict[str, Any]]:\n", + " \"\"\"\n", + " Creates CDE entities based on the synonym information and OCR output.\n", + "\n", + " Args:\n", + " synonym_info (Dict[str, List[Dict[str, Any]]]): Information about synonyms and their occurrences in the OCR output.\n", + " json_ocr (object): The OCR output in the form of a Document AI document.\n", + " synonym_entity_name (str): Name to be assigned to the synonym entity.\n", + "\n", + " Returns:\n", + " List[Dict[str, Any]]: A list of dictionaries representing CDE entities with mention text, page anchors, text anchors, and type.\n", + " \"\"\"\n", + " \n", + " cde_entities=[] \n", + " for syn,tag in synonym_info.items():\n", + " for item in tag:\n", + " try:\n", + " cde_ent=get_new_entity(item,json_ocr,synonym_entity_name)\n", + " cde_entities.append(cde_ent)\n", + " except:\n", + " continue\n", + " return cde_entities\n", + "\n", + "\n", + "def add_ent_json(json_ocr : object, synonym_info : Dict[str, List[Dict[str, Any]]], synonym_entity_name : str) -> object:\n", + " \"\"\"\n", + " Adds CDE entities to the Document AI document based on synonym information.\n", + "\n", + " Args:\n", + " json_ocr (object): The OCR output in the form of a Document AI document.\n", + " synonym_info (Dict[str, List[Dict[str, Any]]]): Information about synonyms and their occurrences in the OCR output.\n", + " synonym_entity_name (str): Name to be assigned to the synonym entity.\n", + "\n", + " Returns:\n", + " object : The updated Document AI document with added CDE entities.\n", + " \"\"\"\n", + " \n", + " cde_entities=create_cde_entities(synonym_info,json_ocr,synonym_entity_name)\n", + " for ent_cde in cde_entities:\n", + " json_ocr.entities.append(ent_cde)\n", + " \n", + " return json_ocr\n", + "\n", + "def main():\n", + " files_name_list,files_path_dict=file_names(gcs_input_uri)\n", + " for i in 
range(len(files_name_list)):\n", + " #print(file_name_list[i])\n", + " file_path='gs://'+gcs_input_uri.split('/')[2]+'/'+files_path_dict[files_name_list[i]]\n", + " print(file_path)\n", + " json_ocr=documentai_json_proto_downloader(file_path.split('/')[2],('/').join(file_path.split('/')[3:]))\n", + " json_ocr,synonym_tag=create_splitter_entities(json_ocr,synonyms_list,synonym_entity_name)\n", + " if save_split_pdfs_flag=='TRUE':\n", + " save_split_pdfs(json_ocr,pdfs_ouput_path,files_name_list[i],synonym_tag)\n", + " store_document_as_json(documentai.Document.to_json(json_ocr),gcs_output_uri.split('/')[2],('/').join(gcs_output_uri.split('/')[3:])+files_name_list[i])\n", + " \n", + "main()" + ] + }, + { + "cell_type": "markdown", + "id": "b571acbf-420d-4a05-bd7e-a0a4b3de211b", + "metadata": {}, + "source": [ + "### Output\n", + "\n", + "Entities will be added to jsons and saved in the output gcs path\n", + "\n", + "* CDE format entities with entity type as ‘synonym_entity’ as shown below\n", + "\n", + "\n", + "\"cde_entity\"\n", + "\n", + "* Splitter format entities added with entity type same as labels or synonyms given\n", + "\n", + "\"splitter_entity\"\n", + "\n", + "\n", + "* If save_split_pdfs_flag is TRUE , then the split pdfs will be saved in gcs path provided with folder names same as labels\n", + "\n", + "\"folders_image\"\n", + "\n", + "If the documents doesnt have any synonyms given , then it will be saved in label_unidentified folder.\n", + "\n", + "The names of files will be `filename’+timestamp.pdf`\n", + "\n", + "\"pdf_split\"" + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/incubator-tools/synonyms_entity_tag/README.md b/incubator-tools/synonyms_entity_tag/README.md new file mode 100644 index 000000000..931b97fd3 --- /dev/null +++ b/incubator-tools/synonyms_entity_tag/README.md @@ -0,0 +1,16 @@ +# Purpose and Description +This tool uses parsed json files and a dictionary with key as entity names and values as synonyms for which the entity has to be tagged. +New entities added to the json. + +Approach: The values of the dictionary are searched in the OCR text and tagged with entity name based on key. + +## Input Details + +* **project_id**: It is the project id of the project. +* **gcs_input_path**: GCS Storage name. It should contain DocAI processed output json files. This bucket is used for processing input files and saving output files in the folders. +* **gcs_output_path**: GCS URI of the folder, where the output is stored. +* **synonyms_entities**: A dictionary with key as entity names and values as synonyms for which the entity has to be tagged. + +## Output Details + +The output jsons files will be stored in the given output directory. 
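For illustration, a matched synonym is appended to the document JSON roughly in the shape sketched below (all values are placeholders; the exact fields are built by the notebook that follows).

```python
# Illustrative shape of an entity added for a matched synonym; values are placeholders.
new_entity = {
    "type": "cust_name",                      # dictionary key whose synonym matched
    "mention_text": "ROSWELL PARK MEMORIAL",  # text found in the OCR layer
    "confidence": 0.97,
    "page_anchor": {
        "page_refs": [{"page": 0, "bounding_poly": {"normalized_vertices": []}}]
    },
    "text_anchor": {"text_segments": [{"start_index": 120, "end_index": 142}]},
}
```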
\ No newline at end of file diff --git a/incubator-tools/synonyms_entity_tag/Synonyms Entity Tag.ipynb b/incubator-tools/synonyms_entity_tag/Synonyms Entity Tag.ipynb new file mode 100644 index 000000000..5a992bfb1 --- /dev/null +++ b/incubator-tools/synonyms_entity_tag/Synonyms Entity Tag.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f2234486-f07d-4fd6-85c7-2aad9e49247f", + "metadata": {}, + "source": [ + "# Doc AI Synonyms Entity Tag" + ] + }, + { + "cell_type": "markdown", + "id": "c2aaf3d8-7924-43ec-937b-ab51cd92a26c", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "420260ad-cfe9-4bc0-9d74-8b568212fa77", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied." + ] + }, + { + "cell_type": "markdown", + "id": "658013b6-09bb-4d9a-87e1-b4fcd35689f8", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This tool uses parsed json files and a dictionary with key as entity names and values as synonyms for which the entity has to be tagged. New entities added to the json. \n", + "\n", + "Approach: The values of the dictionary are searched in the OCR text and tagged with entity name based on key. \n" + ] + }, + { + "cell_type": "markdown", + "id": "3ada8352-3053-4fa9-9247-c0306faf3a78", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "\n", + "* Vertex AI Notebook\n", + "* Parsed json files in GCS Folder\n", + "* Output folder to upload the updated json files\n" + ] + }, + { + "cell_type": "markdown", + "id": "9dc7bbaa-7608-441e-95e4-01435e9db80c", + "metadata": {}, + "source": [ + "## Step by Step procedure" + ] + }, + { + "cell_type": "markdown", + "id": "5ea47bbc-8b14-49cc-9972-34c18eca4d3c", + "metadata": {}, + "source": [ + "### 1.Importing Required Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6aea6ef8-7875-49b5-8416-d3e64005dde8", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "345710ab-8453-43d7-876e-a286068c462e", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import storage\n", + "from tqdm import tqdm\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "import json\n", + "import utilities" + ] + }, + { + "cell_type": "markdown", + "id": "4c700b5e-ca96-40c3-8030-aa8d32cb38c2", + "metadata": {}, + "source": [ + "### 2.Setup the Inputs" + ] + }, + { + "cell_type": "markdown", + "id": "3fddb2dd-cdf7-4dcc-a557-666e6e83a30c", + "metadata": {}, + "source": [ + "* `project_id`: It is the project id of the project.\n", + "* `gcs_input_path`: GCS Storage name. It should contain DocAI processed output json files. This bucket is used for processing input files and saving output files in the folders.\n", + "* `gcs_output_path`: GCS URI of the folder, where the output is stored.\n", + "* `synonyms_entities`:A dictionary with key as entity names and values as synonyms for which the entity has to be tagged." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c45c3613-41b5-4342-bb4e-b3651678fec6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# input details\n", + "project_id = \"xxxx-xxxx-xxxx\"\n", + "gcs_input_path = \"gs://xxxx/xxxx/xxx/\"\n", + "gcs_output_path = \"gs://xxxx/xxxx/xxx/\"\n", + "synonyms_entities = {\n", + " \"cust_name\": [\"ROsweLL PARK MEMORIAL\", \"inst\"],\n", + " \"Name\": [\"name\", \"firstname\", \"lastname\", \"middlename\"],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "54de84e0-6d1d-4cbc-b5e4-6b57dd8c325b", + "metadata": {}, + "source": [ + "### 3.Run the Code" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "062cefe8-84c9-46e8-b00e-f4e9badb7903", + "metadata": {}, + "outputs": [], + "source": [ + "def get_normalizedvertices(normalized_vertices: object) -> tuple:\n", + " \"\"\"\n", + " Get the minimum and maximum coordinates from a list of normalized vertices.\n", + "\n", + " Args:\n", + " normalized_vertices (object) : List of normalized vertices.\n", + " Returns:\n", + " tuple: Minimum x, Minimum y, Maximum x, Maximum y coordinates.\n", + " \"\"\"\n", + "\n", + " min_x = min(vertex.x for vertex in normalized_vertices.normalized_vertices)\n", + " min_y = min(vertex.y for vertex in normalized_vertices.normalized_vertices)\n", + " max_x = max(vertex.x for vertex in normalized_vertices.normalized_vertices)\n", + " max_y = max(vertex.y for vertex in normalized_vertices.normalized_vertices)\n", + "\n", + " return min_x, min_y, max_x, max_y\n", + "\n", + "\n", + "def get_token(json_dict: object, page: str, text_anchors_check: list) -> tuple:\n", + " \"\"\"THIS FUNCTION USED LOADED JSON, PAGE NUMBER AND TEXT ANCHORS AS INPUT AND GIVES THE X AND Y COORDINATES\n", + "\n", + " Args:\n", + " json_dict (object) : The document object containing entities.\n", + " page (str) : The page number as a string where these entities are found.\n", + " text_anchors_check (list) : The list contains text anchors information which need to be checked.\n", + " Returns:\n", + " A tuple with three elements : A dictionary with keys 'min_x', 'min_y', 'max_x', and 'max_y' ; list containing textanchors ; confidence\n", + " \"\"\"\n", + " min_x = \"\"\n", + " temp_text_anc = []\n", + " temp_confidence = []\n", + " temp_ver = {\"x\": [], \"y\": []}\n", + " for token in json_dict.pages[page].tokens:\n", + " if not token.layout.text_anchor.text_segments[0].start_index:\n", + " token.layout.text_anchor.text_segments[0].start_index = 0\n", + " token_anc = token.layout.text_anchor.text_segments[0]\n", + " if token.layout.text_anchor.text_segments == text_anchors_check:\n", + " normalized_vertices = token.layout.bounding_poly\n", + " min_x, min_y, max_x, max_y = get_normalizedvertices(normalized_vertices)\n", + " text_anc_token = token.layout.text_anchor.text_segments\n", + " confidence = token.layout.confidence\n", + " elif (\n", + " int(token_anc.start_index) >= int(text_anchors_check[0][\"start_index\"]) - 2\n", + " and int(token_anc.end_index) <= int(text_anchors_check[0][\"end_index\"]) + 2\n", + " and abs(int(token_anc.start_index) - int(token_anc.end_index)) > 2\n", + " ):\n", + " normalized_vertices = token.layout.bounding_poly\n", + " min_x, min_y, max_x, max_y = get_normalizedvertices(normalized_vertices)\n", + " temp_ver[\"x\"].extend([min_x, max_x])\n", + " temp_ver[\"y\"].extend([min_y, max_y])\n", + " text_anc_token = token.layout.text_anchor.text_segments\n", + " for an1 in text_anc_token:\n", + " 
temp_text_anc.append(an1)\n", + " confidence = token.layout.confidence\n", + " temp_confidence.append(confidence)\n", + "\n", + " if min_x == \"\":\n", + " for token in json_dict.pages[page].tokens:\n", + " if not token.layout.text_anchor.text_segments[0].start_index:\n", + " token.layout.text_anchor.text_segments[0].start_index = 0\n", + "\n", + " if (\n", + " abs(\n", + " int(token.layout.text_anchor.text_segments[0].start_index)\n", + " - int(text_anchors_check[0][\"start_index\"])\n", + " )\n", + " <= 2\n", + " and abs(\n", + " int(token.layout.text_anchor.text_segments[0].end_index)\n", + " - int(text_anchors_check[0][\"end_index\"])\n", + " )\n", + " <= 2\n", + " ):\n", + " normalized_vertices = token.layout.bounding_poly\n", + " min_x, min_y, max_x, max_y = get_normalizedvertices(normalized_vertices)\n", + " text_anc_token = token.layout.text_anchor.text_segments\n", + " confidence = token.layout.confidence\n", + " if len(temp_text_anc) != 0:\n", + " final_ver = {\n", + " \"min_x\": min(temp_ver[\"x\"]),\n", + " \"min_y\": min(temp_ver[\"y\"]),\n", + " \"max_x\": max(temp_ver[\"x\"]),\n", + " \"max_y\": max(temp_ver[\"y\"]),\n", + " }\n", + " final_confidence = min(temp_confidence)\n", + " final_text_anc = sorted(temp_text_anc, key=lambda x: x.end_index)\n", + " return final_ver, final_text_anc, final_confidence\n", + " else:\n", + " return (\n", + " {\"min_x\": min_x, \"min_y\": min_y, \"max_x\": max_x, \"max_y\": max_y},\n", + " text_anc_token,\n", + " confidence,\n", + " )\n", + "\n", + "\n", + "def synonym_entities(json_dict: object, Synonyms_entities: dict) -> object:\n", + " \"\"\"\n", + " Find synonym entities in the loaded JSON and add them to the entities list.\n", + "\n", + " Args:\n", + " json_dict (object): Loaded JSON dictionary.\n", + " Synonyms_entities (dict): Dictionary of synonym entities.\n", + " Returns:\n", + " object: Updated JSON dictionary with added entities.\n", + " \"\"\"\n", + "\n", + " def find_substring_indexes(text: str, substring: str) -> list:\n", + " \"\"\"\n", + " Find the start and end indices of all occurrences of a substring in the given text.\n", + "\n", + " Args:\n", + " text (str): The text to search in.\n", + " substring (str): The substring to find.\n", + "\n", + " Returns:\n", + " List: A list of tuples containing start and end indices of substring occurrences.\n", + " \"\"\"\n", + " import re\n", + "\n", + " if \" \" or \"\\n\" not in substring:\n", + " pattern = re.compile(re.escape(substring), re.IGNORECASE)\n", + " matches = [(match.start(), match.end()) for match in pattern.finditer(text)]\n", + " else:\n", + " pattern = re.compile(\n", + " r\"{}.*{}\".format(\n", + " re.escape(substring.split(\" \")[0]),\n", + " re.escape(substring.split(\" \")[-1]),\n", + " ),\n", + " re.IGNORECASE,\n", + " )\n", + " matches = [\n", + " (match.start(), match.end())\n", + " for match in pattern.finditer(json_dict.text)\n", + " ]\n", + "\n", + " return matches\n", + "\n", + " def create_ent(\n", + " ent_type: str, min_xy: dict, text_anc: list, page: str, confidence: float\n", + " ) -> dict:\n", + " \"\"\"\n", + " Create an entity dictionary.\n", + "\n", + " Args:\n", + " ent_type (str): The type of the entity.\n", + " min_xy (Dict[str, int]): Dictionary containing minimum x, y coordinates of the bounding box.\n", + " text_anc (List): List of text segments.\n", + " page (str): Page number.\n", + " confidence (float): Confidence score.\n", + "\n", + " Returns:\n", + " Dict: The created entity dictionary.\n", + " \"\"\"\n", + " final_mention_text = 
\"\"\n", + " for index1 in text_anc:\n", + " final_mention_text += json_dict.text[\n", + " int(index1.start_index) : int(index1.end_index)\n", + " ]\n", + " min_x = min_xy[\"min_x\"]\n", + " min_y = min_xy[\"min_y\"]\n", + " max_x = min_xy[\"max_x\"]\n", + " max_y = min_xy[\"max_y\"]\n", + " new_ent = {\n", + " \"confidence\": confidence,\n", + " \"mention_text\": final_mention_text,\n", + " \"page_anchor\": {\n", + " \"page_refs\": [\n", + " {\n", + " \"bounding_poly\": {\n", + " \"normalized_vertices\": [\n", + " {\"x\": min_x, \"y\": min_y},\n", + " {\"x\": min_x, \"y\": max_y},\n", + " {\"x\": max_x, \"y\": min_y},\n", + " {\"x\": max_x, \"y\": max_y},\n", + " ]\n", + " },\n", + " \"page\": page,\n", + " }\n", + " ]\n", + " },\n", + " \"text_anchor\": {\"text_segments\": text_anc},\n", + " \"type\": ent_type,\n", + " }\n", + " return new_ent\n", + "\n", + " new_entities = []\n", + " for key, value in Synonyms_entities.items():\n", + " for syn in value:\n", + " match_indexes = find_substring_indexes(json_dict.text, syn)\n", + " for match in match_indexes:\n", + " print(match)\n", + " if len(match) > 1:\n", + " for page in range(len(json_dict.pages)):\n", + " temp = json_dict.pages[page].layout.text_anchor.text_segments\n", + " if not temp[0].start_index:\n", + " temp[0].start_index = 0\n", + " if match[0] >= int(temp[0].start_index) and match[1] < int(\n", + " temp[0].end_index\n", + " ):\n", + " try:\n", + " min_xy, text_anc, confidence = get_token(\n", + " json_dict,\n", + " page,\n", + " [{\"start_index\": match[0], \"end_index\": match[1]}],\n", + " )\n", + " new_ent = create_ent(\n", + " key, min_xy, text_anc, page, confidence\n", + " )\n", + " new_entities.append(new_ent)\n", + " except Exception as e:\n", + " print(e)\n", + " if len(new_entities) > 0:\n", + " for ent1 in new_entities:\n", + " json_dict.entities.append(ent1)\n", + "\n", + " return json_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71679fb2-e09b-41cb-80e0-1ee9b9059ec6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "file_names_list, file_dict = utilities.file_names(gcs_input_path)\n", + "for filename, filepath in tqdm(file_dict.items(), desc=\"Progress\"):\n", + " print(\">>>>>>>>>>>>>>> Processing File : \", filename)\n", + " input_bucket_name = gcs_input_path.split(\"/\")[2]\n", + " if \".json\" in filepath:\n", + " json_dict = utilities.documentai_json_proto_downloader(\n", + " input_bucket_name, filepath\n", + " )\n", + " json_dict_updated = synonym_entities(json_dict, synonyms_entities)\n", + " output_bucket_name = gcs_output_path.split(\"/\")[2]\n", + " output_path_within_bucket = \"/\".join(gcs_output_path.split(\"/\")[3:]) + filename\n", + " utilities.store_document_as_json(\n", + " documentai.Document.to_json(json_dict_updated),\n", + " output_bucket_name,\n", + " output_path_within_bucket,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "8f4910c5-2270-49ea-a585-3ce3023e9038", + "metadata": {}, + "source": [ + "# 4.Output Details\n", + "\n", + "The output jsons files will be stored in the given output directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecc29ee0-7621-4ef1-9788-23c639a18fff", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m112", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m112" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}
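As an optional sanity check (a sketch only, reusing the `project_id` and `gcs_output_path` variables defined in the notebook above), you can list the JSON files written to the output prefix after the tagging loop finishes:

```python
from google.cloud import storage

# List the updated JSONs written by the tagging loop.
client = storage.Client(project=project_id)
bucket_name = gcs_output_path.split("/")[2]
prefix = "/".join(gcs_output_path.split("/")[3:])
for blob in client.list_blobs(bucket_name, prefix=prefix):
    if blob.name.endswith(".json"):
        print("updated:", blob.name)
```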