From 5678a6c25f64d5bea3dc7504a6cd0334291578c7 Mon Sep 17 00:00:00 2001
From: Jacob Bryant
Date: Thu, 31 Aug 2023 16:09:14 -0400
Subject: [PATCH] initial link suggestion work

---
 config.py              |   8 ++
 index/dapps.py         | 106 +++++++++++++++++++++++++++
 scrape/dapp_scraper.py | 162 +++++++++++++++++++++++++++++++++++++++++
 tools/index_widget.py  |   7 +-
 4 files changed, 281 insertions(+), 2 deletions(-)
 create mode 100644 index/dapps.py
 create mode 100644 scrape/dapp_scraper.py

diff --git a/config.py b/config.py
index 557b427e..f1e095e2 100644
--- a/config.py
+++ b/config.py
@@ -19,6 +19,14 @@
     text_key="content",
     extra_keys=["url"],
 )
+
+dapps_index = dict(
+    type="index.weaviate.WeaviateIndex",
+    index_name="ThirdPartyDapps",
+    text_key="description",
+    extra_keys=["url", "name"],
+)
+
 api_docs_index = dict(
     type="index.weaviate.WeaviateIndex",
     index_name="APIDocsV1",
diff --git a/index/dapps.py b/index/dapps.py
new file mode 100644
index 00000000..99c51547
--- /dev/null
+++ b/index/dapps.py
@@ -0,0 +1,106 @@
+# to build the index for dapps, first scrape them from thedapplist.com using the scraper,
+# then run: python -c "from index.dapps import backfill; backfill()"
+
+
+from langchain.docstore.document import Document
+from .weaviate import get_client
+import json
+
+INDEX_NAME = "ThirdPartyDapps"
+INDEX_DESCRIPTION = "Index of third-party dapps"
+DAPP_DESCRIPTION = "description"
+DAPP_NAME = "name"
+DAPP_URL = "url"
+
+def delete_schema() -> None:
+    try:
+        client = get_client()
+        client.schema.delete_class(INDEX_NAME)
+    except Exception as e:
+        print(f"Error deleting schema: {str(e)}")
+
+def create_schema(delete_first: bool = False) -> None:
+    try:
+        client = get_client()
+        if delete_first:
+            delete_schema()
+        client.schema.get()
+        schema = {
+            "classes": [
+                {
+                    "class": INDEX_NAME,
+                    "description": INDEX_DESCRIPTION,
+                    "vectorizer": "text2vec-openai",
+                    "moduleConfig": {
+                        "text2vec-openai": {
+                            "model": "ada",
+                            "modelVersion": "002",
+                            "type": "text"
+                        }
+                    },
+                    "properties": [
+                        {"name": DAPP_NAME, "dataType": ["string"]},
+                        {"name": DAPP_DESCRIPTION, "dataType": ["string"]},
+                        {"name": DAPP_URL, "dataType": ["string"]},
+                        {
+                            "name": "twitterHandle",
+                            "dataType": ["string"],
+                            "description": "The Twitter handle of the Dapp"
+                        },
+                        {
+                            "name": "blogLinks",
+                            "dataType": ["string[]"],
+                            "description": "Links to the blog posts related to the Dapp"
+                        },
+                        {
+                            "name": "discord",
+                            "dataType": ["string"],
+                            "description": "The Discord server link of the Dapp"
+                        },
+                        {
+                            "name": "facebook",
+                            "dataType": ["string"],
+                            "description": "The Facebook page link of the Dapp"
+                        },
+                        {
+                            "name": "instagram",
+                            "dataType": ["string"],
+                            "description": "The Instagram profile link of the Dapp"
+                        },
+                        {
+                            "name": "telegram",
+                            "dataType": ["string"],
+                            "description": "The Telegram channel link of the Dapp"
+                        }
+                    ]
+                }
+            ]
+        }
+        client.schema.create(schema)
+    except Exception as e:
+        print(f"Error creating schema: {str(e)}")
+
+def backfill():
+    try:
+        from langchain.vectorstores import Weaviate
+
+        with open('./knowledge_base/dapp-list.json') as f:
+            dapp_list = json.load(f)
+
+        # Extract the 'name' field from each dapp and store it in the 'documents' list
+        documents = [d.pop("name") for d in dapp_list]
+
+        # Use the remaining fields in each dapp to populate the 'metadatas' list
+        # is this the best 'metadatas' to use?
+        metadatas = dapp_list
+
+        create_schema(delete_first=True)
+
+        client = get_client()
+        w = Weaviate(client, INDEX_NAME, DAPP_NAME)  # is this a proper 3rd argument?
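+        # NOTE (assumption): in LangChain's Weaviate vectorstore the 3rd positional
+        # argument is the text_key, i.e. the property that add_texts() writes each text
+        # to, so the popped names are stored under "name" here; the dapps_index config
+        # reads text_key="description", which comes in through the metadatas instead.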
+        w.add_texts(documents, metadatas)
+    except Exception as e:
+        print(f"Error during backfill in dapps.py: {str(e)}")
+
+
diff --git a/scrape/dapp_scraper.py b/scrape/dapp_scraper.py
new file mode 100644
index 00000000..85b32b4d
--- /dev/null
+++ b/scrape/dapp_scraper.py
@@ -0,0 +1,162 @@
+import requests
+import json
+import os
+from typing import List
+
+BROWSERLESS_API_KEY = os.getenv('BROWSERLESS_API_KEY', '')
+SCRAPE_API_URL = f'https://chrome.browserless.io/scrape?token={BROWSERLESS_API_KEY}'
+
+# scrape a URL for IPFS links and return them as a list
+def get_ipfs_links_from_url(url: str) -> List[str]:
+
+    # specify what elements to return - in this case IPFS links
+    payload = json.dumps({
+        "url": url,
+        "elements": [
+            {
+                "selector": "a[href*='ipfs.io']",
+            },
+        ],
+    })
+
+    # make the request
+    r = requests.post(SCRAPE_API_URL, headers={
+        'Cache-Control': 'no-cache',
+        'Content-Type': 'application/json',
+    }, data=payload)
+
+    # response text
+    response = r.text
+
+    # Parse the JSON string into a dictionary
+    data = json.loads(response)
+
+    # Access the items in the 'results' key
+    results = data['data'][0]['results']
+
+    # instantiate a list to hold the cleaned URLs
+    cleaned_ipfs_urls = []
+
+    # loop through the response data and build a list of just the IPFS links
+    for result in results:
+        href_value = None
+        for attribute in result["attributes"]:
+            if attribute["name"] == "href":
+                href_value = attribute["value"]
+                break
+
+        if href_value:
+            cleaned_ipfs_urls.append(href_value)
+
+    # return the list of links
+    return cleaned_ipfs_urls
+
+
+def scrape_ipfs_links(url: str = 'https://thedapplist.com/curate?status=All&q=') -> None:
+
+    payload = json.dumps({
+        "url": url,
+        "elements": [
+            {
+                "selector": "a[href*='ipfs.io']",
+            },
+        ],
+    })
+
+    r = requests.post(SCRAPE_API_URL, headers={
+        'Cache-Control': 'no-cache',
+        'Content-Type': 'application/json',
+    }, data=payload)
+
+    # response text
+    response = r.text
+
+    # Parse the JSON string into a dictionary
+    data = json.loads(response)
+
+    # Access the items in the 'results' key from the browserless response
+    results = data['data'][0]['results']
+    cleaned_ipfs_urls = []
+
+    for result in results:
+        href_value = None
+        for attribute in result["attributes"]:
+            if attribute["name"] == "href":
+                href_value = attribute["value"]
+                break
+
+        if href_value:
+            cleaned_ipfs_urls.append(href_value)
+
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'en-US,en;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Pragma': 'no-cache',
+        'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
+        'Sec-Ch-Ua-Mobile': '?0',
+        'Sec-Ch-Ua-Platform': '"macOS"',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'cross-site',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
+    }
+
+    responses = []
+
+    # here we take the scraped CIDs and pull info for each dapp
+    # from cloudflare's public IPFS gateway
+    with requests.Session() as session:
+        for url in cleaned_ipfs_urls:
+            CID = get_url_suffix(url)
+            IPFS_URL = f"https://cloudflare-ipfs.com/ipfs/{CID}"
+            try:
+                response = session.get(IPFS_URL, headers=headers, timeout=30)
+                if response.status_code == 200:
+                    # process the response
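+                    # (each gateway response is expected to be JSON with a 'msg' string
+                    # whose 'payload' holds the dapp fields; see clean_payload_data below)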
+                    responses.append(response.content)
+                else:
+                    print(f"Failed to retrieve {url}. Status code: {response.status_code}")
+            except requests.RequestException as e:
+                print(f"Error fetching {url}: {e}")
+
+    # convert the bytes objects to strings and load them as JSON
+    responses_str = [json.loads(response.decode()) for response in responses]
+
+    # clean the responses and save them to a new json file called 'dapp-list.json'
+    with open('dapp-list.json', 'w') as f:
+        json.dump(clean_payload_data(responses_str), f, ensure_ascii=False)
+
+
+
+# a util function that, in this case, gets us the IPFS CID
+def get_url_suffix(url: str) -> str:
+    return url.rsplit('/', 1)[-1]
+
+# Function to further clean the original JSON data by focusing only on the 'payload' property of 'msg'
+def clean_payload_data(original_data):
+    # Extract and parse the 'msg' fields, then extract the 'payload' property
+    cleaned_payload_data = [json.loads(item.get('msg', '{}')).get('payload', {}) for item in original_data]
+
+    # reduce each object to just the few properties that we need
+    reduced_data = []
+    for dapp in cleaned_payload_data:
+        cleaned_dapp = {
+            "name": dapp["name"],
+            "description": dapp["description"],
+            "url": dapp["url"],
+            "twitterHandle": dapp["twitterHandle"],
+            "blogLinks": dapp["blogLinks"],
+            "discord": dapp["socialLinks"]["discord"],
+            "facebook": dapp["socialLinks"]["facebook"],
+            "instagram": dapp["socialLinks"]["instagram"],
+            "telegram": dapp["socialLinks"]["telegram"]
+        }
+        reduced_data.append(cleaned_dapp)
+
+    return reduced_data
+
+
diff --git a/tools/index_widget.py b/tools/index_widget.py
index d2bc86e6..7375a93d 100644
--- a/tools/index_widget.py
+++ b/tools/index_widget.py
@@ -355,13 +355,16 @@ def fn(token_handler):
 @error_wrap
 def fetch_scraped_sites(query: str) -> Callable:
     def fn(token_handler):
-        scraped_sites_index = config.initialize(config.scraped_sites_index)
+        # below is the index previously used for scraped sites - do we still use it?
+        # if so, should we add a separate function for the new dapps_index?
+        # scraped_sites_index = config.initialize(config.scraped_sites_index)
+        dapps_index = config.initialize(config.dapps_index)
         tool = dict(
             type="tools.index_answer.IndexAnswerTool",
             _streaming=True,
             name="ScrapedSitesIndexAnswer",
            content_description="",  # not used
-            index=scraped_sites_index,
+            index=dapps_index,
             top_k=3,
             source_key="url",
         )
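A minimal usage sketch of the intended flow, assuming the repo root is importable and BROWSERLESS_API_KEY is set (the scraper invocation is an assumption; only the backfill command is spelled out in the patch):

    # scrape thedapplist.com curation page; writes dapp-list.json to the current directory
    from scrape.dapp_scraper import scrape_ipfs_links
    scrape_ipfs_links()

    # backfill() reads ./knowledge_base/dapp-list.json, so the scraped file needs to be
    # moved or copied into knowledge_base/ first; it then rebuilds the ThirdPartyDapps class
    from index.dapps import backfill
    backfill()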