
Commit

Merge branch 'main' into reformat_notebooks_kao
zbyosufzai authored Jul 10, 2024
2 parents 90fe92d + ea6f571 commit ef498ee
Showing 5 changed files with 139 additions and 44 deletions.
8 changes: 4 additions & 4 deletions notebooks/GenAI/AWS_Bedrock_Intro.ipynb
@@ -290,10 +290,10 @@
"import pandas as pd\n",
"import os\n",
"df = pd.read_csv('oa_comm.filelist.csv')\n",
"#first 100 files\n",
"first_100=df[0:100]\n",
"#first 50 files\n",
"first_50=df[0:50]\n",
"#save new metadata\n",
"first_100.to_csv('oa_comm.filelist_100.csv', index=False)"
"first_50.to_csv('oa_comm.filelist_100.csv', index=False)"
]
},
{
@@ -305,7 +305,7 @@
"source": [
"import os\n",
"#gather path to files in bucket\n",
"for i in first_100['Key']:\n",
"for i in first_50['Key']:\n",
" os.system(f'aws s3 cp s3://pmc-oa-opendata/{i} s3://{bucket}/docs/ --sse')"
]
},
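If you prefer to stay inside Python rather than shelling out to the AWS CLI, here is a minimal boto3 sketch of the same copy loop — an assumption-laden equivalent, not the notebook's code; it presumes `bucket` and `first_50` are defined as above and that your default credentials chain is configured:

```python
# Hedged boto3 equivalent of the `aws s3 cp --sse` loop above.
import boto3

s3 = boto3.client("s3")
for key in first_50["Key"]:
    s3.copy_object(
        CopySource={"Bucket": "pmc-oa-opendata", "Key": key},
        Bucket=bucket,                     # destination bucket (defined earlier)
        Key=f"docs/{key.split('/')[-1]}",  # keep only the file name
        ServerSideEncryption="AES256",     # equivalent of the --sse flag
    )
```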
117 changes: 113 additions & 4 deletions notebooks/GenAI/AWS_GenAI_Huggingface.ipynb
@@ -188,6 +188,14 @@
"train_dataset, test_dataset = load_dataset(\"ccdv/pubmed-summarization\", split=[\"train\", \"test\"])\n"
]
},
{
"cell_type": "markdown",
"id": "3399abb1-af8f-46ee-92ea-c8344eeddd09",
"metadata": {},
"source": [
"## Finetuning our Model Locally"
]
},
{
"cell_type": "markdown",
"id": "ed6ddff1-2636-4e3b-88ee-e3c86c584245",
@@ -210,9 +218,10 @@
"outputs": [],
"source": [
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
"model_name=\"google/flan-t5-small\"\n",
"\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")"
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)"
]
},
{
Expand Down Expand Up @@ -253,6 +262,106 @@
"test_dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"abstracts\"])"
]
},
{
"cell_type": "markdown",
"id": "b3ffd612-abde-4666-8c85-cc7069de2129",
"metadata": {},
"source": [
"The first step to training our model other than setting up our datasets is to set our **hyperparameters**. Hyperparameters depend on your training script and for this one we need to identify our model, the location of our train and test files, etc. iN this case we are using a one created by Hugging Face."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c06bef19-cc3c-476f-943c-78368e9f49e8",
"metadata": {},
"outputs": [],
"source": [
"from transformers import TrainingArguments\n",
"\n",
"training_args = TrainingArguments(output_dir=\"test_trainer\")"
]
},
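The cell above takes the defaults; for orientation, a hedged sketch of the same call with a few commonly tuned hyperparameters spelled out — the values are illustrative, not prescribed by this tutorial:

```python
from transformers import TrainingArguments

# Illustrative settings only; tune for your hardware and dataset size.
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=2e-5,              # optimizer step size
    per_device_train_batch_size=8,   # batch size per GPU/CPU
    num_train_epochs=3,              # full passes over the training set
    weight_decay=0.01,               # L2-style regularization
)
```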
{
"cell_type": "markdown",
"id": "cff31d69-9f54-4235-a377-7c5e758fbca8",
"metadata": {},
"source": [
"Next create setting to evaluate the models accuracy."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24bbe62e-9140-4bef-88ae-3e5029ddb25c",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import evaluate\n",
"\n",
"metric = evaluate.load(\"accuracy\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b82caeba-2daa-4526-b67d-04f45d4a9934",
"metadata": {},
"outputs": [],
"source": [
"def compute_metrics(eval_pred):\n",
" logits, labels = eval_pred\n",
" predictions = np.argmax(logits, axis=-1)\n",
" return metric.compute(predictions=predictions, references=labels)"
]
},
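As a quick sanity check, the accuracy metric simply counts exact matches between predictions and references. A toy run (values made up purely for illustration):

```python
import evaluate

# 3 of the 4 toy predictions match the references, so accuracy is 0.75.
metric = evaluate.load("accuracy")
print(metric.compute(predictions=[0, 1, 1, 0], references=[0, 1, 0, 0]))
# {'accuracy': 0.75}
```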
{
"cell_type": "code",
"execution_count": null,
"id": "f5b50ec0-87b8-4578-96aa-e26bda9d99b8",
"metadata": {},
"outputs": [],
"source": [
"from transformers import TrainingArguments, Trainer\n",
"\n",
"training_args = TrainingArguments(output_dir=\"test_trainer\", evaluation_strategy=\"epoch\")"
]
},
{
"cell_type": "markdown",
"id": "df2225ac-8e92-4a14-a368-eebff9ead6bf",
"metadata": {},
"source": [
"Finally we can train our model!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e59332ae-c9e3-4a9b-9a7c-7020c87227da",
"metadata": {},
"outputs": [],
"source": [
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=test_dataset,\n",
" compute_metrics=compute_metrics,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f35520bb-b6ca-4996-b87e-2fbfdcfc0dff",
"metadata": {},
"outputs": [],
"source": [
"trainer.train()"
]
},
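Once training finishes, you will usually want to persist the weights and try a summary. A hedged sketch under stated assumptions — `article_text` is a hypothetical input document, the save path is illustrative, and flan-t5 follows the T5 `summarize:` prompt convention:

```python
# Save the fine-tuned weights alongside the tokenizer (path is illustrative).
trainer.save_model("test_trainer/final")
tokenizer.save_pretrained("test_trainer/final")

# `article_text` is a placeholder for any document you want summarized.
article_text = "..."
inputs = tokenizer("summarize: " + article_text, return_tensors="pt",
                   truncation=True, max_length=512)
summary_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```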
{
"cell_type": "markdown",
"id": "6ac841f6-c65e-4ebf-8c42-3030e2f92cb0",
@@ -342,7 +451,7 @@
"id": "9204b6dc-8f6e-407e-8c68-a036a6a5b7c9",
"metadata": {},
"source": [
"### Training our Model"
"### Training our ModelFinetuning our Model via Vertex AI Training API"
]
},
{
@@ -634,7 +743,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.13"
}
},
"nbformat": 4,
14 changes: 4 additions & 10 deletions notebooks/GenAI/AWS_GenAI_Jumpstart.ipynb
@@ -75,20 +75,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "6cf1429a-314e-49b6-a4f7-16a3e52319af",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"(\n",
" model_id,\n",
" model_version,\n",
") = (\n",
" \"meta-textgeneration-llama-2-7b-f\",\n",
" \"*\",\n",
")"
"model_id, model_version = \"meta-textgeneration-llama-2-13b-f\", \"2.*\""
]
},
{
@@ -110,7 +104,7 @@
"source": [
"from sagemaker.jumpstart.model import JumpStartModel\n",
"\n",
"model = JumpStartModel(model_id=model_id)\n",
"model = JumpStartModel(model_id=model_id, model_version=model_version)\n",
"predictor = model.deploy()\n"
]
},
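After `model.deploy()` returns, the predictor can be queried directly. A hedged sketch using the dialog payload format documented for JumpStart Llama 2 chat models — the parameter values are illustrative, and these models require accepting the EULA per request:

```python
# Illustrative invocation; the payload shape follows the JumpStart
# Llama 2 chat examples and may differ across model versions.
payload = {
    "inputs": [[{"role": "user", "content": "What is Amazon SageMaker?"}]],
    "parameters": {"max_new_tokens": 256, "temperature": 0.6, "top_p": 0.9},
}
response = predictor.predict(payload, custom_attributes="accept_eula=true")
print(response)
```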
@@ -254,7 +248,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.13"
}
},
"nbformat": 4,
35 changes: 15 additions & 20 deletions notebooks/GenAI/Pubmed_RAG_chatbot.ipynb
@@ -77,13 +77,7 @@
},
"outputs": [],
"source": [
"(\n",
" model_id,\n",
" model_version,\n",
") = (\n",
" \"meta-textgeneration-llama-2-7b-f\",\n",
" \"*\",\n",
")"
"model_id, model_version = \"meta-textgeneration-llama-2-13b-f\", \"2.*\""
]
},
{
@@ -105,8 +99,8 @@
"source": [
"from sagemaker.jumpstart.model import JumpStartModel\n",
"\n",
"model = JumpStartModel(model_id=model_id)\n",
"predictor = model.deploy()"
"model = JumpStartModel(model_id=model_id, model_version=model_version)\n",
"predictor = model.deploy()\n"
]
},
{
Expand Down Expand Up @@ -225,7 +219,7 @@
"id": "93a8595a-767f-4cad-9273-62d8e2cf60d1",
"metadata": {},
"source": [
"We only want the metadata of the first 100 files to keep this tutorial short."
"We only want the metadata of the first 50 files to keep this tutorial short."
]
},
{
Expand All @@ -242,19 +236,20 @@
"import os\n",
"\n",
"df = pd.read_csv('oa_comm.filelist.csv')\n",

"\n",
"#first 100 files\n",
"first_100=df[0:101]\n",
"#first 50 files\n",
"first_50=df[0:50]\n",
"#save new metadata\n",
"first_100.to_csv('oa_comm.filelist_100.csv', index=False)"
"first_50.to_csv('oa_comm.filelist_50.csv', index=False)"
]
},
{
"cell_type": "markdown",
"id": "abd1ae93-450e-4c79-83cc-ea46a1b507c1",
"metadata": {},
"source": [
"Lets look at our metadata! We can see that the bucket path to the files are under the **Key** column. This column is what we will use to loop through the PMC bucket and copy the first 100 files to our bucket."
"Lets look at our metadata! We can see that the bucket path to the files are under the **Key** column. This column is what we will use to loop through the PMC bucket and copy the first 50 files to our bucket."
]
},
{
Expand All @@ -264,7 +259,7 @@
"metadata": {},
"outputs": [],
"source": [
"first_100"
"first_50"
]
},
{
Expand All @@ -276,7 +271,7 @@
"source": [
"import os\n",
"#gather path to files in bucket\n",
"for i in first_100['Key']:\n",
"for i in first_50['Key']:\n",
" os.system(f'aws s3 cp s3://pmc-oa-opendata/{i} s3://{bucket}/docs/ --sse')"
]
},
@@ -295,7 +290,7 @@
"metadata": {},
"outputs": [],
"source": [
"! aws s3 cp oa_comm.filelist_100.csv s3://{bucket}/docs/"
"! aws s3 cp oa_comm.filelist_50.csv s3://{bucket}/docs/"
]
},
{
@@ -373,12 +368,12 @@
},
"source": [
"```python\n",
"from langchain.retrievers import PubMedRetriever\n",
"from langchain_community.retrievers import PubMedRetriever\n",
"from langchain.retrievers import AmazonKendraRetriever\n",
"from langchain.llms import SagemakerEndpoint\n",
"from langchain_community.llms import SagemakerEndpoint\n",
"from langchain_community.llms.sagemaker_endpoint import LLMContentHandler\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.llms.sagemaker_endpoint import LLMContentHandler\n",
"import sys\n",
"import json\n",
"import os\n",
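The rest of this cell is truncated in the diff view; for orientation, a hedged sketch of how these imports typically come together. The endpoint name, region, and chain wiring are assumptions here, not the notebook's exact code, and `content_handler` stands in for an `LLMContentHandler` like the one in the chat script below:

```python
# Hypothetical wiring of the imports above; the notebook's actual
# chain setup lives in the truncated portion of the cell.
retriever = PubMedRetriever()
llm = SagemakerEndpoint(
    endpoint_name="jumpstart-llama-2-13b-f",   # assumed endpoint name
    region_name="us-east-1",                   # assumed region
    content_handler=content_handler,           # an LLMContentHandler instance
)
chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
print(chain.invoke({"question": "What is PubMed?", "chat_history": []}))
```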
@@ -1,12 +1,11 @@
-from langchain.retrievers import PubMedRetriever
+from langchain_community.retrievers import PubMedRetriever
 from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
-#from langchain import SagemakerEndpoint
-from langchain.llms.sagemaker_endpoint import LLMContentHandler
+from langchain_community.llms import SagemakerEndpoint
+from langchain_community.llms.sagemaker_endpoint import LLMContentHandler
 import sys
 import json
 import os
-from langchain.llms import SagemakerEndpoint


class bcolors:
@@ -24,7 +23,6 @@ class bcolors:

def build_chain():
region = os.environ["AWS_REGION"]
-#kendra_index_id = os.environ["KENDRA_INDEX_ID"]
endpoint_name = os.environ["LLAMA_2_ENDPOINT"]

class ContentHandler(LLMContentHandler):
@@ -58,7 +56,6 @@ def transform_output(self, output: bytes) -> str:
content_handler=content_handler,
)

-#retriever = AmazonKendraRetriever(index_id=kendra_index_id,region_name=region)
retriever= PubMedRetriever()

prompt_template = """
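The handler's body is truncated above; a hedged sketch of what a Llama 2 `LLMContentHandler` generally looks like — the payload field names follow the JumpStart chat format and are assumptions here, not copied from the script:

```python
import json
from langchain_community.llms.sagemaker_endpoint import LLMContentHandler

class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        # Wrap the prompt in the Llama 2 chat dialog format.
        payload = {
            "inputs": [[{"role": "user", "content": prompt}]],
            "parameters": model_kwargs,
        }
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        # `output` is the endpoint's streaming body; decode the reply text.
        response = json.loads(output.read().decode("utf-8"))
        return response[0]["generation"]["content"]
```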
