From 7a21d0988ce62e386153041080eaf7473d301469 Mon Sep 17 00:00:00 2001 From: ZePan110 Date: Wed, 4 Sep 2024 16:22:46 +0800 Subject: [PATCH] Add compose file. (#612) --- .github/workflows/_comps-workflow.yml | 2 +- .../docker/compose/llms-compose-cd.yaml | 3 +- .../docker/compose/lvms-compose-cd.yaml | 3 +- .../compose/vectorstores-compose-cd.yaml | 9 ++++ comps/dataprep/utils.py | 49 +++++++++++++++++-- 5 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/docker/compose/vectorstores-compose-cd.yaml diff --git a/.github/workflows/_comps-workflow.yml b/.github/workflows/_comps-workflow.yml index cdcce9823..106ab2005 100644 --- a/.github/workflows/_comps-workflow.yml +++ b/.github/workflows/_comps-workflow.yml @@ -67,7 +67,7 @@ jobs: cd ${{ github.workspace }} if [[ $(grep -c "llava-tgi:" ${docker_compose_yml}) != 0 ]]; then - git clone https://github.com/yuanwu2017/tgi-gaudi.git && cd tgi-gaudi && git checkout v2.0.4 + git clone https://github.com/yuanwu2017/tgi-gaudi.git && cd tgi-gaudi && git checkout v2.0.4 && cd ../ fi if [[ $(grep -c "vllm-openvino:" ${docker_compose_yml}) != 0 ]]; then git clone https://github.com/vllm-project/vllm.git vllm-openvino diff --git a/.github/workflows/docker/compose/llms-compose-cd.yaml b/.github/workflows/docker/compose/llms-compose-cd.yaml index 7ca537595..17c2f7949 100644 --- a/.github/workflows/docker/compose/llms-compose-cd.yaml +++ b/.github/workflows/docker/compose/llms-compose-cd.yaml @@ -8,5 +8,6 @@ services: image: ${REGISTRY:-opea}/llm-native:${TAG:-latest} vllm-openvino: build: - dockerfile: vllm-openvino/Dockerfile.openvino + context: vllm-openvino + dockerfile: Dockerfile.openvino image: ${REGISTRY:-opea}/vllm-openvino:${TAG:-latest} diff --git a/.github/workflows/docker/compose/lvms-compose-cd.yaml b/.github/workflows/docker/compose/lvms-compose-cd.yaml index 78fbb420d..679b8481a 100644 --- a/.github/workflows/docker/compose/lvms-compose-cd.yaml +++ 
b/.github/workflows/docker/compose/lvms-compose-cd.yaml @@ -23,5 +23,6 @@ services: image: ${REGISTRY:-opea}/lvm-tgi:${TAG:-latest} llava-tgi: build: - dockerfile: tgi-gaudi/Dockerfile + context: tgi-gaudi + dockerfile: Dockerfile image: ${REGISTRY:-opea}/llava-tgi:${TAG:-latest} diff --git a/.github/workflows/docker/compose/vectorstores-compose-cd.yaml b/.github/workflows/docker/compose/vectorstores-compose-cd.yaml new file mode 100644 index 000000000..00ac03d68 --- /dev/null +++ b/.github/workflows/docker/compose/vectorstores-compose-cd.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# this file should be run in the root of the repo +services: + vectorstore-pathway: + build: + dockerfile: comps/vectorstores/langchain/pathway + image: ${REGISTRY:-opea}/vectorstore-pathway:${TAG:-latest} diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py index b300af428..d960977e3 100644 --- a/comps/dataprep/utils.py +++ b/comps/dataprep/utils.py @@ -41,6 +41,11 @@ from langchain_community.llms import HuggingFaceEndpoint from PIL import Image +from comps import CustomLogger + +logger = CustomLogger("prepare_doc_util") +logflag = os.getenv("LOGFLAG", False) + class TimeoutError(Exception): pass @@ -428,14 +433,51 @@ def fetch(self, url, headers=None, max_times=5): if not headers: headers = self.headers while max_times: - if not url.startswith("http") or not url.startswith("https"): + parsed_url = urlparse(url) + if not parsed_url.scheme: url = "http://" + url - print("start fetch %s...", url) + if logflag: + logger.info("start fetch %s..." 
% url) try: response = requests.get(url, headers=headers, verify=True) if response.status_code != 200: print("fail to fetch %s, response status code: %s", url, response.status_code) else: + # Extract charset from the Content-Type header + content_type = response.headers.get("Content-Type", "").lower() + if "charset=" in content_type: + # Extract charset value from the content-type header + charset = content_type.split("charset=")[-1].strip() + response.encoding = charset + if logflag: + logger.info(f"Charset detected and set: {response.encoding}") + else: + import re + + # Extract charset from the response HTML content + charset_from_meta = None + # Check for <meta charset="..."> + match = re.search(r'<meta\s+charset=["\']?([^"\'>]+)["\']?', response.text, re.IGNORECASE) + if match: + charset_from_meta = match.group(1) + # Check for <meta http-equiv="Content-Type" content="...; charset=..."> + if not charset_from_meta: + match = re.search( + r'<meta\s+http-equiv=["\']?content-type["\']?\s+content=["\']?[^"\']*charset=([^"\'>]+)["\']?', + response.text, + re.IGNORECASE, + ) + if match: + charset_from_meta = match.group(1) + if charset_from_meta: + response.encoding = charset_from_meta + if logflag: + logger.info(f"Charset detected and set from meta tag: {response.encoding}") + else: + # Fallback to default encoding + response.encoding = "utf-8" + if logflag: + logger.info("Charset not specified, using default utf-8") return response except Exception as e: print("fail to fetch %s, caused by %s", url, e) @@ -540,8 +582,9 @@ def load_html_data(url): main_content = all_text if main_content == "" else main_content main_content = main_content.replace("\n", "") main_content = main_content.replace("\n\n", "") - main_content = uni_pro(main_content) main_content = re.sub(r"\s+", " ", main_content) + if logflag: + logger.info("main_content=[%s]" % main_content) return main_content