Skip to content

Commit

Permalink
chore: add support for TypeScript code splitting (#11160)
Browse files Browse the repository at this point in the history
- **Description:** Adds TypeScript as a supported language for `TextSplitter`

---------

Co-authored-by: Jacob Lee <[email protected]>
  • Loading branch information
fynnfluegge and jacoblee93 authored Sep 28, 2023
1 parent 17fcbed commit b738ccd
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ from langchain.text_splitter import (
'go',
'java',
'js',
'ts',
'php',
'proto',
'python',
Expand Down Expand Up @@ -107,6 +108,36 @@ js_docs

</CodeOutputBlock>

## TS
Here's an example using the TS text splitter:


```python
TS_CODE = """
function helloWorld(): void {
console.log("Hello, World!");
}
// Call the function
helloWorld();
"""

ts_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.TS, chunk_size=60, chunk_overlap=0
)
ts_docs = ts_splitter.create_documents([TS_CODE])
ts_docs
```

<CodeOutputBlock lang="python">

```
[Document(page_content='function helloWorld(): void {\n console.log("Hello, World!");\n}', metadata={}),
Document(page_content='// Call the function\nhelloWorld();', metadata={})]
```

</CodeOutputBlock>

## Markdown

Here's an example using the Markdown text splitter:
Expand Down
27 changes: 27 additions & 0 deletions libs/langchain/langchain/text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,6 +615,7 @@ class Language(str, Enum):
GO = "go"
JAVA = "java"
JS = "js"
TS = "ts"
PHP = "php"
PROTO = "proto"
PYTHON = "python"
Expand Down Expand Up @@ -782,6 +783,32 @@ def get_separators_for_language(language: Language) -> List[str]:
" ",
"",
]
elif language == Language.TS:
return [
"\nenum ",
"\ninterface ",
"\nnamespace ",
"\ntype ",
# Split along class definitions
"\nclass ",
# Split along function definitions
"\nfunction ",
"\nconst ",
"\nlet ",
"\nvar ",
# Split along control flow statements
"\nif ",
"\nfor ",
"\nwhile ",
"\nswitch ",
"\ncase ",
"\ndefault ",
# Split by the normal type of lines
"\n\n",
"\n",
" ",
"",
]
elif language == Language.PHP:
return [
# Split along function definitions
Expand Down
27 changes: 27 additions & 0 deletions libs/langchain/tests/unit_tests/test_text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,33 @@ def test_javascript_code_splitter() -> None:
]


def test_typescript_code_splitter() -> None:
    """Verify that TypeScript source is split into the expected chunks.

    A small TypeScript snippet is fed through a splitter built with
    ``RecursiveCharacterTextSplitter.from_language`` for ``Language.TS``,
    using ``CHUNK_SIZE`` as the chunk size and no overlap. The resulting
    pieces must match the expected list exactly, in order.
    """
    # The snippet lives at column 0 because its whitespace is part of the
    # fixture: any change to it changes what the splitter sees.
    ts_source = """
function helloWorld(): void {
console.log("Hello, World!");
}
// Call the function
helloWorld();
"""
    expected_chunks = [
        "function",
        "helloWorld():",
        "void {",
        'console.log("He',
        "llo,",
        'World!");',
        "}",
        "// Call the",
        "function",
        "helloWorld();",
    ]

    ts_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.TS, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )

    assert ts_splitter.split_text(ts_source) == expected_chunks


def test_java_code_splitter() -> None:
splitter = RecursiveCharacterTextSplitter.from_language(
Language.JAVA, chunk_size=CHUNK_SIZE, chunk_overlap=0
Expand Down

0 comments on commit b738ccd

Please sign in to comment.