From df58fe51f57006493e0da378dd36a2ae94aadd46 Mon Sep 17 00:00:00 2001 From: ZePan110 Date: Fri, 20 Sep 2024 10:51:32 +0800 Subject: [PATCH] Add hyperlinks and paths validation. (#132) Signed-off-by: ZePan110 --- .github/workflows/pr-path-detection.yml | 123 ++++++++++++++++++ README.md | 2 +- doc/platform-optimization/README.md | 2 +- evals/evaluation/autorag/evaluation/README.md | 4 +- evals/evaluation/rag_eval/README.md | 2 +- examples/AudioQnA/README.md | 48 +++++++ 6 files changed, 176 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/pr-path-detection.yml create mode 100644 examples/AudioQnA/README.md diff --git a/.github/workflows/pr-path-detection.yml b/.github/workflows/pr-path-detection.yml new file mode 100644 index 00000000..2bfb3969 --- /dev/null +++ b/.github/workflows/pr-path-detection.yml @@ -0,0 +1,123 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +name: Check Paths and Hyperlinks + +on: + pull_request: + branches: [main] + types: [opened, reopened, ready_for_review, synchronize] + +jobs: + check-the-validity-of-hyperlinks-in-README: + runs-on: ubuntu-latest + steps: + - name: Clean Up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout Repo GenAIEval + uses: actions/checkout@v4 + + - name: Check the Validity of Hyperlinks + run: | + cd ${{github.workspace}} + fail="FALSE" + url_lines=$(grep -Eo '\]\(http[s]?://[^)]+\)' --include='*.md' -r .|grep -Ev 'GenAIEval/blob/main') + if [ -n "$url_lines" ]; then + for url_line in $url_lines; do + url=$(echo "$url_line"|cut -d '(' -f2 | cut -d ')' -f1|sed 's/\.git$//') + path=$(echo "$url_line"|cut -d':' -f1 | cut -d'/' -f2-) + response=$(curl -L -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid link from ${{github.workspace}}/$path: $url" + fail="TRUE" + fi + fi + done + fi + + if [[ "$fail" == "TRUE" ]]; then + exit 1 + else + echo "All hyperlinks are valid." 
+ fi + shell: bash + + check-the-validity-of-relative-path: + runs-on: ubuntu-latest + steps: + - name: Clean up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout Repo GenAIEval + uses: actions/checkout@v4 + + - name: Checking Relative Path Validity + run: | + cd ${{github.workspace}} + fail="FALSE" + repo_name=${{ github.event.pull_request.head.repo.full_name }} + if [ "$(echo "$repo_name"|cut -d'/' -f1)" != "opea-project" ]; then + owner=$(echo "${{ github.event.pull_request.head.repo.full_name }}" |cut -d'/' -f1) + branch="https://github.com/$owner/GenAIEval/tree/${{ github.event.pull_request.head.ref }}" + else + branch="https://github.com/opea-project/GenAIEval/blob/${{ github.event.pull_request.head.ref }}" + fi + link_head="https://github.com/opea-project/GenAIEval/blob/main" + png_lines=$(grep -Eo '\]\([^)]+\)' --include='*.md' -r .|grep -Ev 'http') + if [ -n "$png_lines" ]; then + for png_line in $png_lines; do + refer_path=$(echo "$png_line"|cut -d':' -f1 | cut -d'/' -f2-) + png_path=$(echo "$png_line"|cut -d '(' -f2 | cut -d ')' -f1) + if [[ "${png_path:0:1}" == "/" ]]; then + check_path=${{github.workspace}}$png_path + elif [[ "${png_path:0:1}" == "#" ]]; then + check_path=${{github.workspace}}/$refer_path$png_path + else + check_path=${{github.workspace}}/$(dirname "$refer_path")/$png_path + fi + real_path=$(realpath $check_path) + if [ $? -ne 0 ]; then + echo "Path $png_path in file ${{github.workspace}}/$refer_path does not exist" + fail="TRUE" + else + url=$link_head$(echo "$real_path" | sed 's|.*/GenAIEval||') + response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Retry failed. Check branch ${{ github.event.pull_request.head.ref }}" + url_dev=$branch$(echo "$real_path" | sed 's|.*/GenAIEval||') + response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url_dev") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url_dev") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid path from ${{github.workspace}}/$refer_path: $png_path" + fail="TRUE" + fi + else + echo "Check branch ${{ github.event.pull_request.head.ref }} successfully." + fi + fi + fi + fi + done + fi + + if [[ "$fail" == "TRUE" ]]; then + exit 1 + else + echo "All hyperlinks are valid." + fi + shell: bash diff --git a/README.md b/README.md index 8734f83a..3d6b6d6e 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ results = evaluate(args) #### remote service usage -1. setup a separate server with [GenAIComps](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/lm-eval) +1. setup a separate server with [GenAIComps](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/utils/lm-eval) ``` # build cpu docker diff --git a/doc/platform-optimization/README.md b/doc/platform-optimization/README.md index ae74765d..8b98a21c 100644 --- a/doc/platform-optimization/README.md +++ b/doc/platform-optimization/README.md @@ -98,7 +98,7 @@ Let us consider isolating AI inference and reranking containers in application's Gaudi accelerated pipeline. 
 In the
-[manifest](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/kubernetes/manifests/gaudi/chatqna.yaml)
+[manifest](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml)
 there are "tgi", "tei" and "teirerank" containers in
 "chatqna-tgi" and "chatqna-tei" and "chatqna-teirerank"
 deployments that will need a lot of CPUs. They implement
 text-generation-interface and
diff --git a/evals/evaluation/autorag/evaluation/README.md b/evals/evaluation/autorag/evaluation/README.md
index 8068d58b..99a623d1 100644
--- a/evals/evaluation/autorag/evaluation/README.md
+++ b/evals/evaluation/autorag/evaluation/README.md
@@ -1,6 +1,6 @@
 # AutoRAG to evaluate the RAG system performance
 
-AutoRAG is help to end-to-end evaluate the performance of the whole system. Currently, we support to evaluate the performance from 4 perspectives, answer_relevancy, faithfulness, context_recall, context_precision. Before using this service, the use should firstly prepare the groundtruth dataset in the [standard format](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ground_truth.jsonl). We also provide a [script](https://github.com/opea-project/GenAIEval/blob/main/evals/evaluation/autorag/data_generation/gen_eval_dataset.py) to automatically generate the groundtruth query and answer.
+AutoRAG helps to evaluate the end-to-end performance of the whole system. Currently, we support evaluating the performance from 4 perspectives: answer_relevancy, faithfulness, context_recall, and context_precision. Before using this service, the user should first prepare the groundtruth dataset in the [standard format](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ragas/ground_truth.jsonl). We also provide a [script](https://github.com/opea-project/GenAIEval/blob/main/evals/evaluation/autorag/data_generation/gen_eval_dataset.py) to automatically generate the groundtruth query and answer.
 
 ## Service preparation
 The evaluation for the RAG system is based on the set up of the RAG services. Please follow [the steps](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/README.md) to set up your RAG services.
@@ -12,7 +12,7 @@ At this moment, we provide a solution that test the single group of parameters a
 python -u ragas_evaluation_benchmark.py --ground_truth_file ground_truth.jsonl --search_type mmr --k 1 --fetch_k 5 --score_threshold 0.3 --top_n 1 --temperature 0.01 --top_k 5 --top_p 0.95 --repetition_penalty 1.1 --use_openai_key True
 ```
 
-For evaluating multiple groups of parameters, please use [this script](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/run_rag_benchmark.py).
+For evaluating multiple groups of parameters, please use [this script](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ragas/run_rag_benchmark.py).
 ```bash
 python -u run_rag_benchmark.py --config config.yaml
 ```
diff --git a/evals/evaluation/rag_eval/README.md b/evals/evaluation/rag_eval/README.md
index 59f7dd2f..1186464a 100644
--- a/evals/evaluation/rag_eval/README.md
+++ b/evals/evaluation/rag_eval/README.md
@@ -7,7 +7,7 @@
 - [Prerequisites](#prerequisites)
 - [MultiHop (English dataset)](#multihop)
   - [Launch Service of RAG System](#launch-service-of-rag-system)
-  - [Launch Service of LLM-as-a-Judge](launch-service-of-llm)
+  - [Launch Service of LLM-as-a-Judge](#launch-service-of-llm-as-a-judge)
   - [Prepare Dataset](#prepare-dataset)
   - [Evaluation](#evaluation)
 - [CRUD (Chinese dataset)](#crud)
diff --git a/examples/AudioQnA/README.md b/examples/AudioQnA/README.md
new file mode 100644
index 00000000..45290620
--- /dev/null
+++ b/examples/AudioQnA/README.md
@@ -0,0 +1,48 @@
+# AudioQnA Accuracy Evaluation
+
+## Dataset
+
+
+We evaluate ASR accuracy on the test set of the LibriSpeech [dataset](https://huggingface.co/datasets/andreagasparini/librispeech_test_only), which contains 2620 records of audio and text.
+
+## Metrics
+
+We evaluate the WER (Word Error Rate) metric of the ASR microservice.
+
+## Evaluation
+
+### Launch ASR microservice
+
+Launch the ASR microservice with the following commands. For more details, please refer to the [doc](https://github.com/opea-project/GenAIComps/tree/main/comps/asr).
+
+```bash
+git clone https://github.com/opea-project/GenAIComps
+cd GenAIComps
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
+# change the model to evaluate by editing the --model_name_or_path value
+docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny"
+```
+
+### Evaluate
+
+Install dependencies:
+
+```
+pip install -r requirements.txt
+```
+
+Evaluate the ASR accuracy:
+```bash
+# validate the offline model
+# python offline_evaluate.py
+# validate the online ASR microservice accuracy
+python online_evaluate.py
+```
+
+### Performance Results
+Here are the tested results for your reference:
+| Model | WER (%) |
+| --- | ---- |
+| whisper-large-v2 | 2.87 |
+| whisper-large | 2.7 |
+| whisper-medium | 3.45 |
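
Note on the WER figures in the new AudioQnA README above: WER is the word-level edit distance (substitutions + deletions + insertions) divided by the number of reference words, usually reported as a percentage. The snippet below is a minimal sketch of that computation using the `jiwer` package with placeholder transcripts; the package choice and the sample strings are assumptions for illustration and are not part of `online_evaluate.py` or `offline_evaluate.py` in this patch.

```python
# Minimal WER sketch (assumptions: `jiwer` is installed; the transcripts below are
# placeholders, not real outputs of the whisper ASR microservice).
import jiwer

references = [
    "he hoped there would be stew for dinner",
    "stuff it into you his belly counselled him",
]
hypotheses = [
    "he hoped there would be stew for dinner",
    "stuff it into you his belly counseled him",
]

# WER = (substitutions + deletions + insertions) / number of reference words
wer = jiwer.wer(references, hypotheses)
print(f"WER: {wer * 100:.2f}%")
```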