feat(albert): data chunking + information search (#16)
* fix: remote
* feat(chunk): add the chunking part (#18)
* fix: chunk
* fix: finish
* fix: finish
* fix: finish
* fix: finish
* fix: done
* fix: format
* config: disable some pylint and mypy rules that are not necessarily useful
* fix: review feedback
* fix: review feedback
* fix: review feedback
* fix: review feedback
* fix: config

Co-authored-by: Victor DEGLIAME <[email protected]>
18 changed files with 626 additions and 361 deletions.
```diff
@@ -101,4 +101,4 @@ dmypy.json
 local_dump/*

 # Data
-data/*.csv
+data/*
```
@@ -0,0 +1,20 @@
```python
import os
from typing import Any, Dict

import httpx

from srdt_analysis.constants import ALBERT_ENDPOINT


class AlbertBase:
    def __init__(self):
        self.api_key = os.getenv("ALBERT_API_KEY")
        if not self.api_key:
            raise ValueError(
                "API key must be provided either in constructor or as environment variable"
            )
        self.headers = {"Authorization": f"Bearer {self.api_key}"}

    def get_models(self) -> Dict[str, Any]:
        response = httpx.get(f"{ALBERT_ENDPOINT}/v1/models", headers=self.headers)
        return response.json()
```
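For context, a minimal usage sketch of this class (not part of the commit): it assumes `ALBERT_API_KEY` is exported in the environment, since the constructor only reads that variable even though the error message also mentions a constructor argument.

```python
# Hypothetical usage sketch (not part of the commit). Assumes ALBERT_API_KEY is
# set in the environment and that the Albert API endpoint is reachable.
from srdt_analysis.albert import AlbertBase

client = AlbertBase()
models = client.get_models()  # GET {ALBERT_ENDPOINT}/v1/models, parsed as JSON
print(models)
```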
@@ -0,0 +1,41 @@
```python
from typing import List

from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

from srdt_analysis.constants import CHUNK_OVERLAP, CHUNK_SIZE
from srdt_analysis.models import SplitDocument


class Chunker:
    def __init__(self):
        self._markdown_splitter = MarkdownHeaderTextSplitter(
            [
                ("#", "Header 1"),
                ("##", "Header 2"),
                ("###", "Header 3"),
                ("####", "Header 4"),
                ("#####", "Header 5"),
                ("######", "Header 6"),
            ],
            strip_headers=False,
        )
        self._character_recursive_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
        )

    def split_markdown(self, markdown: str) -> List[SplitDocument]:
        md_header_splits = self._markdown_splitter.split_text(markdown)
        documents = self._character_recursive_splitter.split_documents(md_header_splits)
        return [SplitDocument(doc.page_content, doc.metadata) for doc in documents]

    def split_character_recursive(self, content: str) -> List[SplitDocument]:
        text_splits = self._character_recursive_splitter.split_text(content)
        return [SplitDocument(text, {}) for text in text_splits]

    def split(self, content: str, content_type: str = "markdown"):
        if content_type.lower() == "markdown":
            return self.split_markdown(content)
        raise ValueError(f"Unsupported content type: {content_type}")
```
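A short sketch of how the chunker might be driven (illustrative, not part of the commit): the sample markdown is made up, and `SplitDocument` is assumed to expose `page_content` and `metadata`, as the upload code later in this diff suggests.

```python
# Illustrative only: the sample markdown below is made up.
from srdt_analysis.chunker import Chunker

chunker = Chunker()
markdown = "# Congés payés\n\nIntroduction.\n\n## Durée\n\nDétails sur la durée."
chunks = chunker.split(markdown)  # default "markdown" strategy: split on headers, then by size
for chunk in chunks:
    # Each SplitDocument carries the text plus the header metadata attached by
    # MarkdownHeaderTextSplitter, e.g. {"Header 1": "Congés payés", "Header 2": "Durée"}.
    print(chunk.metadata, len(chunk.page_content))
```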
@@ -0,0 +1,104 @@
```python
import json
from io import BytesIO
from typing import Any, Dict, List

import httpx

from srdt_analysis.albert import AlbertBase
from srdt_analysis.constants import ALBERT_ENDPOINT
from srdt_analysis.models import ChunkDataList, DocumentData


class Collections(AlbertBase):
    def _create(self, collection_name: str, model: str) -> str:
        payload = {"name": collection_name, "model": model}
        response = httpx.post(
            f"{ALBERT_ENDPOINT}/v1/collections", headers=self.headers, json=payload
        )
        return response.json()["id"]

    def create(self, collection_name: str, model: str) -> str:
        collections: List[Dict[str, Any]] = self.list()
        for collection in collections:
            if collection["name"] == collection_name:
                self.delete(collection["id"])
        return self._create(collection_name, model)

    def list(self) -> List[Dict[str, Any]]:
        response = httpx.get(f"{ALBERT_ENDPOINT}/v1/collections", headers=self.headers)
        return response.json()["data"]

    def delete(self, id_collection: str):
        response = httpx.delete(
            f"{ALBERT_ENDPOINT}/v1/collections/{id_collection}", headers=self.headers
        )
        response.raise_for_status()

    def delete_all(self, collection_name) -> None:
        collections = self.list()
        for collection in collections:
            if collection["name"] == collection_name:
                self.delete(collection["id"])
        return None

    def search(
        self,
        prompt: str,
        id_collections: List[str],
        k: int = 5,
        score_threshold: float = 0,
    ) -> ChunkDataList:
        response = httpx.post(
            f"{ALBERT_ENDPOINT}/v1/search",
            headers=self.headers,
            json={
                "prompt": prompt,
                "collections": id_collections,
                "k": k,
                "score_threshold": score_threshold,
            },
        )
        return response.json()

    def upload(
        self,
        data: List[DocumentData],
        id_collection: str,
    ) -> None:
        result = []
        for dt in data:
            dt: DocumentData
            chunks = dt["content_chunked"]
            for chunk in chunks:
                result.append(
                    {
                        "text": chunk.page_content,
                        "title": dt["title"],
                        "metadata": {
                            "cdtn_id": dt["cdtn_id"],
                            "structure_du_chunk": chunk.metadata,
                            "url": dt["url"],
                        },
                    }
                )

        file_content = json.dumps(result).encode("utf-8")

        files = {
            "file": (
                "content.json",
                BytesIO(file_content),
                "multipart/form-data",
            )
        }

        request_data = {"request": '{"collection": "%s"}' % id_collection}
        response = httpx.post(
            f"{ALBERT_ENDPOINT}/v1/files",
            headers=self.headers,
            files=files,
            data=request_data,
        )

        response.raise_for_status()
        return
```
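Putting the pieces together, an end-to-end sketch of chunking, upload, and search could look like the following (not part of the commit). The document dict is hypothetical and only carries the keys that `upload` actually reads (`cdtn_id`, `title`, `url`, `content_chunked`); the collection name and query are made up.

```python
from srdt_analysis.chunker import Chunker
from srdt_analysis.collections import Collections
from srdt_analysis.constants import MODEL_VECTORISATION

chunker = Chunker()
collections = Collections()

# Hypothetical document: keys mirror those consumed by Collections.upload.
document = {
    "cdtn_id": "doc-123",                           # made-up identifier
    "title": "Exemple de fiche",                    # made-up title
    "url": "https://code.travail.gouv.fr/exemple",  # placeholder URL
    "content_chunked": chunker.split("# Titre\n\nContenu de la fiche."),
}

collection_id = collections.create("srdt-demo", MODEL_VECTORISATION)
collections.upload([document], collection_id)
results = collections.search("durée des congés payés", [collection_id], k=5)
```

Note that `create` deletes any existing collection with the same name before creating a new one, so rerunning this sketch always starts from an empty collection.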
@@ -0,0 +1,6 @@
```python
ALBERT_ENDPOINT = "https://albert.api.etalab.gouv.fr"
MODEL_VECTORISATION = "BAAI/bge-m3"
LLM_MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500
BASE_URL_CDTN = "https://code.travail.gouv.fr"
```
One file in this commit was deleted; its contents are not shown here.