From 31904283dd0c5bb8a8da4d5a4ad558ad0dd36ead Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 26 Mar 2024 16:10:42 -0400 Subject: [PATCH 01/10] log exceptions --- docs/scripts/check_imports.py | 84 +++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 docs/scripts/check_imports.py diff --git a/docs/scripts/check_imports.py b/docs/scripts/check_imports.py new file mode 100644 index 0000000000000..2a6599aaf699e --- /dev/null +++ b/docs/scripts/check_imports.py @@ -0,0 +1,84 @@ +import importlib +import json +import logging +import os +from pathlib import Path +import re +from typing import List, Tuple + +logger = logging.getLogger(__name__) + +DOCS_DIR = Path(os.path.abspath(__file__)).parents[1] / "docs" +import_pattern = re.compile( + r"import\s+(\w+)|from\s+([\w\.]+)\s+import\s+((?:\w+(?:,\s*)?)+|\(.*?\))", + re.DOTALL +) + +def _get_imports_from_code_cell(code_lines: str) -> List[Tuple[str, str]]: + """Get (module, import) statements from a single code cell.""" + import_statements = [] + for line in code_lines: + line = line.strip() + if line.startswith("#") or not line: + continue + # Join lines that end with a backslash + if line.endswith("\\"): + line = line[:-1].rstrip() + " " + continue + matches = import_pattern.findall(line) + for match in matches: + if match[0]: # simple import statement + import_statements.append((match[0], '')) + else: # from ___ import statement + module, items = match[1], match[2] + items_list = items.replace(' ', '').split(',') + for item in items_list: + import_statements.append((module, item)) + return import_statements + + +def _extract_import_statements(notebook_path: str) -> List[Tuple[str, str]]: + """Get (module, import) statements from a Jupyter notebook.""" + with open(notebook_path, "r", encoding="utf-8") as file: + notebook = json.load(file) + code_cells = [cell for cell in notebook["cells"] if cell["cell_type"] == "code"] + import_statements = [] + for cell in code_cells: + code_lines = cell["source"] + import_statements.extend(_get_imports_from_code_cell(code_lines)) + return import_statements + + +def _check_import_statements(import_statements: List[Tuple[str, str]]) -> None: + for module, item in import_statements: + try: + if item: + try: + # submodule + full_module_name = f"{module}.{item}" + importlib.import_module(full_module_name) + except ModuleNotFoundError: + # attribute + imported_module = importlib.import_module(module) + getattr(imported_module, item) + else: + importlib.import_module(module) + except Exception as e: + logger.error(f"Failed to resolve '{item}' in module '{module}'. Error: {e}") + + +def check_notebooks(directory: str) -> None: + for root, _, files in os.walk(directory): + for file in files: + if file.endswith(".ipynb"): + notebook_path = os.path.join(root, file) + import_statements = [ + (module, item) + for module, item in _extract_import_statements(notebook_path) + if "langchain" in module + ] + _check_import_statements(import_statements) + + +if __name__ == "__main__": + check_notebooks(DOCS_DIR) From 986d83d1831c44b5ca2743b27dc6f31a6998766d Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 26 Mar 2024 16:31:20 -0400 Subject: [PATCH 02/10] subset to recognized packages --- docs/scripts/check_imports.py | 64 ++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/docs/scripts/check_imports.py b/docs/scripts/check_imports.py index 2a6599aaf699e..3f27f8d6f9c2d 100644 --- a/docs/scripts/check_imports.py +++ b/docs/scripts/check_imports.py @@ -5,6 +5,7 @@ from pathlib import Path import re from typing import List, Tuple +import warnings logger = logging.getLogger(__name__) @@ -49,7 +50,9 @@ def _extract_import_statements(notebook_path: str) -> List[Tuple[str, str]]: return import_statements -def _check_import_statements(import_statements: List[Tuple[str, str]]) -> None: +def _get_bad_imports(import_statements: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + """Collect offending import statements.""" + offending_imports = [] for module, item in import_statements: try: if item: @@ -59,15 +62,47 @@ def _check_import_statements(import_statements: List[Tuple[str, str]]) -> None: importlib.import_module(full_module_name) except ModuleNotFoundError: # attribute - imported_module = importlib.import_module(module) - getattr(imported_module, item) + try: + imported_module = importlib.import_module(module) + getattr(imported_module, item) + except Exception as e: + offending_imports.append((module, item)) + except Exception as e: + offending_imports.append((module, item)) else: importlib.import_module(module) except Exception as e: - logger.error(f"Failed to resolve '{item}' in module '{module}'. Error: {e}") + offending_imports.append((module, item)) + return offending_imports -def check_notebooks(directory: str) -> None: + +def _is_relevant_import(module: str) -> bool: + """Check if module is recognized.""" + # Ignore things like langchain_{bla}, where bla is unrecognized. + recognized_packages = [ + "langchain", + "langchain_core", + "langchain_community", + "langchain_experimental", + "langchain_text_splitters", + ] + return module.split(".")[0] in recognized_packages + + +def _serialize_bad_imports(bad_files: list) -> str: + """Serialize bad imports to a string.""" + bad_imports_str = "" + for file, bad_imports in bad_files: + bad_imports_str += f"File: {file}\n" + for module, item in bad_imports: + bad_imports_str += f" {module}.{item}\n" + return bad_imports_str + + +def check_notebooks(directory: str) -> list: + """Check notebooks for broken import statements.""" + bad_files = [] for root, _, files in os.walk(directory): for file in files: if file.endswith(".ipynb"): @@ -75,10 +110,23 @@ def check_notebooks(directory: str) -> None: import_statements = [ (module, item) for module, item in _extract_import_statements(notebook_path) - if "langchain" in module + if _is_relevant_import(module) ] - _check_import_statements(import_statements) + bad_imports = _get_bad_imports(import_statements) + if bad_imports: + bad_files.append( + ( + file, + bad_imports, + ) + ) + return bad_files if __name__ == "__main__": - check_notebooks(DOCS_DIR) + bad_files = check_notebooks(DOCS_DIR) + if bad_files: + warnings.warn( + "Found bad imports:\n" + f"{_serialize_bad_imports(bad_files)}" + ) From bc6c7d4dabefff572034fda2025f8ced79998944 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 26 Mar 2024 16:32:17 -0400 Subject: [PATCH 03/10] format --- docs/scripts/check_imports.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/docs/scripts/check_imports.py b/docs/scripts/check_imports.py index 3f27f8d6f9c2d..8366207f130e6 100644 --- a/docs/scripts/check_imports.py +++ b/docs/scripts/check_imports.py @@ -2,19 +2,19 @@ import json import logging import os -from pathlib import Path import re -from typing import List, Tuple import warnings +from pathlib import Path +from typing import List, Tuple logger = logging.getLogger(__name__) DOCS_DIR = Path(os.path.abspath(__file__)).parents[1] / "docs" import_pattern = re.compile( - r"import\s+(\w+)|from\s+([\w\.]+)\s+import\s+((?:\w+(?:,\s*)?)+|\(.*?\))", - re.DOTALL + r"import\s+(\w+)|from\s+([\w\.]+)\s+import\s+((?:\w+(?:,\s*)?)+|\(.*?\))", re.DOTALL ) + def _get_imports_from_code_cell(code_lines: str) -> List[Tuple[str, str]]: """Get (module, import) statements from a single code cell.""" import_statements = [] @@ -29,10 +29,10 @@ def _get_imports_from_code_cell(code_lines: str) -> List[Tuple[str, str]]: matches = import_pattern.findall(line) for match in matches: if match[0]: # simple import statement - import_statements.append((match[0], '')) + import_statements.append((match[0], "")) else: # from ___ import statement module, items = match[1], match[2] - items_list = items.replace(' ', '').split(',') + items_list = items.replace(" ", "").split(",") for item in items_list: import_statements.append((module, item)) return import_statements @@ -126,7 +126,4 @@ def check_notebooks(directory: str) -> list: if __name__ == "__main__": bad_files = check_notebooks(DOCS_DIR) if bad_files: - warnings.warn( - "Found bad imports:\n" - f"{_serialize_bad_imports(bad_files)}" - ) + warnings.warn("Found bad imports:\n" f"{_serialize_bad_imports(bad_files)}") From 41947bb38c5ca42aa98c901aeabcb140ded0a630 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 28 Mar 2024 15:41:48 -0400 Subject: [PATCH 04/10] add workflow job --- .github/workflows/_test_doc_imports.yml | 50 +++++++++++++++++++++++++ .github/workflows/check_diffs.yml | 6 +++ 2 files changed, 56 insertions(+) create mode 100644 .github/workflows/_test_doc_imports.yml diff --git a/.github/workflows/_test_doc_imports.yml b/.github/workflows/_test_doc_imports.yml new file mode 100644 index 0000000000000..e179198fae709 --- /dev/null +++ b/.github/workflows/_test_doc_imports.yml @@ -0,0 +1,50 @@ +name: test + +env: + POETRY_VERSION: "1.7.1" + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + name: "check doc imports #${{ matrix.python-version }}" + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} + uses: "./.github/actions/poetry_setup" + with: + python-version: ${{ matrix.python-version }} + poetry-version: ${{ env.POETRY_VERSION }} + cache-key: core + + - name: Install dependencies + shell: bash + run: poetry install --with test + + - name: Install langchain editable + run: | + poetry run pip install -e libs/langchain libs/community libs/experimental + + - name: Check doc imports + shell: bash + run: | + python docs/scripts/check_imports.py + + - name: Ensure the test did not create any additional files + shell: bash + run: | + set -eu + + STATUS="$(git status)" + echo "$STATUS" + + # grep will exit non-zero if the target message isn't found, + # and `set -e` above will cause the step to fail. + echo "$STATUS" | grep 'nothing to commit, working tree clean' diff --git a/.github/workflows/check_diffs.yml b/.github/workflows/check_diffs.yml index c4bd8b448b826..764cbf7c98ea4 100644 --- a/.github/workflows/check_diffs.yml +++ b/.github/workflows/check_diffs.yml @@ -60,6 +60,12 @@ jobs: working-directory: ${{ matrix.working-directory }} secrets: inherit + test_doc_imports: + needs: [ build ] + if: ${{ needs.build.outputs.dirs-to-test != '[]' }} + uses: ./.github/workflows/_test_doc_imports.yml + secrets: inherit + compile-integration-tests: name: cd ${{ matrix.working-directory }} needs: [ build ] From 5f129d2cb409b27984c4019744ca28532b5ab380 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 28 Mar 2024 17:08:20 -0400 Subject: [PATCH 05/10] restrict python versions --- .github/workflows/_test_doc_imports.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/_test_doc_imports.yml b/.github/workflows/_test_doc_imports.yml index e179198fae709..47f85f930ba14 100644 --- a/.github/workflows/_test_doc_imports.yml +++ b/.github/workflows/_test_doc_imports.yml @@ -9,9 +9,6 @@ jobs: strategy: matrix: python-version: - - "3.8" - - "3.9" - - "3.10" - "3.11" name: "check doc imports #${{ matrix.python-version }}" steps: From 63b20784c59be3f870a273321d490d2c7f77c362 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Fri, 29 Mar 2024 11:23:29 -0400 Subject: [PATCH 06/10] cr --- .github/workflows/_test_doc_imports.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_test_doc_imports.yml b/.github/workflows/_test_doc_imports.yml index 47f85f930ba14..b61c41b89145b 100644 --- a/.github/workflows/_test_doc_imports.yml +++ b/.github/workflows/_test_doc_imports.yml @@ -1,4 +1,7 @@ -name: test +name: test_doc_imports + +on: + workflow_call: env: POETRY_VERSION: "1.7.1" @@ -27,12 +30,12 @@ jobs: - name: Install langchain editable run: | - poetry run pip install -e libs/langchain libs/community libs/experimental + poetry run pip install -e libs/core libs/langchain libs/community libs/experimental - name: Check doc imports shell: bash run: | - python docs/scripts/check_imports.py + poetry run python docs/scripts/check_imports.py - name: Ensure the test did not create any additional files shell: bash From 0d5598c792580eb98b6a04d0a0de73a9aff5cb33 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Fri, 29 Mar 2024 11:34:13 -0400 Subject: [PATCH 07/10] add module docstring --- docs/scripts/check_imports.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/scripts/check_imports.py b/docs/scripts/check_imports.py index 8366207f130e6..a29bedf1e15f1 100644 --- a/docs/scripts/check_imports.py +++ b/docs/scripts/check_imports.py @@ -1,3 +1,4 @@ +"""This script checks documentation for broken import statements.""" import importlib import json import logging From 5d316e927ef2ac62ac52d83d1c7d4e02b15f690e Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Fri, 29 Mar 2024 11:45:39 -0400 Subject: [PATCH 08/10] cr --- docs/scripts/check_imports.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/scripts/check_imports.py b/docs/scripts/check_imports.py index a29bedf1e15f1..c416932432d1e 100644 --- a/docs/scripts/check_imports.py +++ b/docs/scripts/check_imports.py @@ -66,13 +66,13 @@ def _get_bad_imports(import_statements: List[Tuple[str, str]]) -> List[Tuple[str try: imported_module = importlib.import_module(module) getattr(imported_module, item) - except Exception as e: + except AttributeError: offending_imports.append((module, item)) - except Exception as e: + except Exception: offending_imports.append((module, item)) else: importlib.import_module(module) - except Exception as e: + except Exception: offending_imports.append((module, item)) return offending_imports @@ -106,7 +106,7 @@ def check_notebooks(directory: str) -> list: bad_files = [] for root, _, files in os.walk(directory): for file in files: - if file.endswith(".ipynb"): + if file.endswith(".ipynb") and not file.endswith("-checkpoint.ipynb"): notebook_path = os.path.join(root, file) import_statements = [ (module, item) @@ -117,7 +117,7 @@ def check_notebooks(directory: str) -> list: if bad_imports: bad_files.append( ( - file, + os.path.join(root, file), bad_imports, ) ) @@ -127,4 +127,4 @@ def check_notebooks(directory: str) -> list: if __name__ == "__main__": bad_files = check_notebooks(DOCS_DIR) if bad_files: - warnings.warn("Found bad imports:\n" f"{_serialize_bad_imports(bad_files)}") + raise ImportError("Found bad imports:\n" f"{_serialize_bad_imports(bad_files)}") From 063b521f63ee92f6ee260dee70ccb51669a35681 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Fri, 29 Mar 2024 11:56:08 -0400 Subject: [PATCH 09/10] introduce error to check CI --- docs/docs/expression_language/get_started.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/expression_language/get_started.ipynb b/docs/docs/expression_language/get_started.ipynb index a7c03adb5835c..67f2576b3c09e 100644 --- a/docs/docs/expression_language/get_started.ipynb +++ b/docs/docs/expression_language/get_started.ipynb @@ -58,7 +58,7 @@ } ], "source": [ - "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.output_parsers import StrOutputParse\n", "from langchain_core.prompts import ChatPromptTemplate\n", "from langchain_openai import ChatOpenAI\n", "\n", From 3097a1c06afbbb968ec219cecc179805d05766a6 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Fri, 29 Mar 2024 12:08:00 -0400 Subject: [PATCH 10/10] Revert "introduce error to check CI" This reverts commit 063b521f63ee92f6ee260dee70ccb51669a35681. --- docs/docs/expression_language/get_started.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/expression_language/get_started.ipynb b/docs/docs/expression_language/get_started.ipynb index 67f2576b3c09e..a7c03adb5835c 100644 --- a/docs/docs/expression_language/get_started.ipynb +++ b/docs/docs/expression_language/get_started.ipynb @@ -58,7 +58,7 @@ } ], "source": [ - "from langchain_core.output_parsers import StrOutputParse\n", + "from langchain_core.output_parsers import StrOutputParser\n", "from langchain_core.prompts import ChatPromptTemplate\n", "from langchain_openai import ChatOpenAI\n", "\n",