From 5678a6c25f64d5bea3dc7504a6cd0334291578c7 Mon Sep 17 00:00:00 2001
From: Jacob Bryant
Date: Thu, 31 Aug 2023 16:09:14 -0400
Subject: [PATCH] initial link suggestion work

---
 config.py              |   8 ++
 index/dapps.py         | 106 +++++++++++++++++++++++++++
 scrape/dapp_scraper.py | 162 +++++++++++++++++++++++++++++++++++++++++
 tools/index_widget.py  |   7 +-
 4 files changed, 281 insertions(+), 2 deletions(-)
 create mode 100644 index/dapps.py
 create mode 100644 scrape/dapp_scraper.py

diff --git a/config.py b/config.py
index 557b427e..f1e095e2 100644
--- a/config.py
+++ b/config.py
@@ -19,6 +19,14 @@
     text_key="content",
     extra_keys=["url"],
 )
+
+dapps_index = dict(
+    type="index.weaviate.WeaviateIndex",
+    index_name="ThirdPartyDapps",
+    text_key="description",
+    extra_keys=["url", "name"],
+)
+
 api_docs_index = dict(
     type="index.weaviate.WeaviateIndex",
     index_name="APIDocsV1",
diff --git a/index/dapps.py b/index/dapps.py
new file mode 100644
index 00000000..99c51547
--- /dev/null
+++ b/index/dapps.py
@@ -0,0 +1,106 @@
+# to build the index for dapps, first scrape them from thedapplist.com using the scraper,
+# then run: python -c "from index.dapps import backfill; backfill()"
+
+
+from langchain.docstore.document import Document
+from .weaviate import get_client
+import json
+
+INDEX_NAME = "ThirdPartyDapps"
+INDEX_DESCRIPTION = "Index of third-party dapps"
+DAPP_DESCRIPTION = "description"
+DAPP_NAME = "name"
+DAPP_URL = "url"
+
+def delete_schema() -> None:
+    try:
+        client = get_client()
+        client.schema.delete_class(INDEX_NAME)
+    except Exception as e:
+        print(f"Error deleting schema: {str(e)}")
+
+def create_schema(delete_first: bool = False) -> None:
+    try:
+        client = get_client()
+        if delete_first:
+            delete_schema()
+        client.schema.get()
+        schema = {
+            "classes": [
+                {
+                    "class": INDEX_NAME,
+                    "description": INDEX_DESCRIPTION,
+                    "vectorizer": "text2vec-openai",
+                    "moduleConfig": {
+                        "text2vec-openai": {
+                            "model": "ada",
+                            "modelVersion": "002",
+                            "type": "text"
+                        }
+                    },
+                    "properties": [
+                        {"name": DAPP_NAME, "dataType": ["string"]},
+                        {"name": DAPP_DESCRIPTION, "dataType": ["string"]},
+                        {"name": DAPP_URL, "dataType": ["string"]},
+                        {
+                            "name": "twitterHandle",
+                            "dataType": ["string"],
+                            "description": "The Twitter handle of the Dapp"
+                        },
+                        {
+                            "name": "blogLinks",
+                            "dataType": ["string[]"],
+                            "description": "Links to the blog posts related to the Dapp"
+                        },
+                        {
+                            "name": "discord",
+                            "dataType": ["string"],
+                            "description": "The Discord server link of the Dapp"
+                        },
+                        {
+                            "name": "facebook",
+                            "dataType": ["string"],
+                            "description": "The Facebook page link of the Dapp"
+                        },
+                        {
+                            "name": "instagram",
+                            "dataType": ["string"],
+                            "description": "The Instagram profile link of the Dapp"
+                        },
+                        {
+                            "name": "telegram",
+                            "dataType": ["string"],
+                            "description": "The Telegram channel link of the Dapp"
+                        }
+                    ]
+                }
+            ]
+        }
+        client.schema.create(schema)
+    except Exception as e:
+        print(f"Error creating schema: {str(e)}")
+
+def backfill():
+    try:
+        from langchain.vectorstores import Weaviate
+
+        with open('./knowledge_base/dapp-list.json') as f:
+            dapp_list = json.load(f)
+
+        # Extract the 'name' field from each dapp and store it in the 'documents' list
+        documents = [d.pop("name") for d in dapp_list]
+
+        # Use the remaining fields in each dapp to populate the 'metadatas' list
+        # is this the best 'metadatas' to use?
+        metadatas = dapp_list
+
+        create_schema(delete_first=True)
+
+        client = get_client()
+        w = Weaviate(client, INDEX_NAME, DAPP_NAME)  # is this a proper 3rd argument?
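+        # NOTE (assumption): in LangChain's Weaviate vectorstore the 3rd positional
+        # argument is the text_key, i.e. the property that add_texts() writes each text
+        # to, so the popped names are stored under "name" here; the dapps_index config
+        # reads text_key="description", which comes in through the metadatas instead.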
+        w.add_texts(documents, metadatas)
+    except Exception as e:
+        print(f"Error during backfill in dapps.py: {str(e)}")
+
+
diff --git a/scrape/dapp_scraper.py b/scrape/dapp_scraper.py
new file mode 100644
index 00000000..85b32b4d
--- /dev/null
+++ b/scrape/dapp_scraper.py
@@ -0,0 +1,162 @@
+import requests
+import json
+import os
+from typing import List
+
+BROWSERLESS_API_KEY = os.getenv('BROWSERLESS_API_KEY', '')
+SCRAPE_API_URL = f'https://chrome.browserless.io/scrape?token={BROWSERLESS_API_KEY}'
+
+# scrape a URL for IPFS links and return them as a list
+def get_ipfs_links_from_url(url: str) -> List[str]:
+
+    # specify what elements to return - in this case IPFS links
+    payload = json.dumps({
+        "url": url,
+        "elements": [
+            {
+                "selector": "a[href*='ipfs.io']",
+            },
+        ],
+    })
+
+    # make the request
+    r = requests.post(SCRAPE_API_URL, headers={
+        'Cache-Control': 'no-cache',
+        'Content-Type': 'application/json',
+    }, data=payload)
+
+    # response text
+    response = r.text
+
+    # Parse the JSON string into a dictionary
+    data = json.loads(response)
+
+    # Access the items in the 'results' key
+    results = data['data'][0]['results']
+
+    # instantiate a list to hold the cleaned URLs
+    cleaned_ipfs_urls = []
+
+    # loop through the response data and build a list of just the IPFS links
+    for result in results:
+        href_value = None
+        for attribute in result["attributes"]:
+            if attribute["name"] == "href":
+                href_value = attribute["value"]
+                break
+
+        if href_value:
+            cleaned_ipfs_urls.append(href_value)
+
+    # return the list of links
+    return cleaned_ipfs_urls
+
+
+def scrape_ipfs_links(url: str = 'https://thedapplist.com/curate?status=All&q=') -> None:
+
+    payload = json.dumps({
+        "url": url,
+        "elements": [
+            {
+                "selector": "a[href*='ipfs.io']",
+            },
+        ],
+    })
+
+    r = requests.post(SCRAPE_API_URL, headers={
+        'Cache-Control': 'no-cache',
+        'Content-Type': 'application/json',
+    }, data=payload)
+
+    # response text
+    response = r.text
+
+    # Parse the JSON string into a dictionary
+    data = json.loads(response)
+
+    # Access the items in the 'results' key from the browserless response
+    results = data['data'][0]['results']
+    cleaned_ipfs_urls = []
+
+    for result in results:
+        href_value = None
+        for attribute in result["attributes"]:
+            if attribute["name"] == "href":
+                href_value = attribute["value"]
+                break
+
+        if href_value:
+            cleaned_ipfs_urls.append(href_value)
+
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'en-US,en;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Pragma': 'no-cache',
+        'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
+        'Sec-Ch-Ua-Mobile': '?0',
+        'Sec-Ch-Ua-Platform': '"macOS"',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'cross-site',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
+    }
+
+    responses = []
+
+    # here we take the scraped CIDs and pull info for each dapp
+    # from cloudflare's public IPFS gateway
+    with requests.Session() as session:
+        for url in cleaned_ipfs_urls:
+            CID = get_url_suffix(url)
+            IPFS_URL = f"https://cloudflare-ipfs.com/ipfs/{CID}"
+            try:
+                response = session.get(IPFS_URL, headers=headers, timeout=30)
+                if response.status_code == 200:
+                    # process the response
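+                    # (each gateway response is expected to be JSON with a 'msg' string
+                    # whose 'payload' holds the dapp fields; see clean_payload_data below)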
+                    responses.append(response.content)
+                else:
+                    print(f"Failed to retrieve {url}. Status code: {response.status_code}")
+            except requests.RequestException as e:
+                print(f"Error fetching {url}: {e}")
+
+    # convert the bytes objects to strings and load them as JSON
+    responses_str = [json.loads(response.decode()) for response in responses]
+
+    # clean the responses and save them to a new json file called 'dapp-list.json'
+    with open('dapp-list.json', 'w') as f:
+        json.dump(clean_payload_data(responses_str), f, ensure_ascii=False)
+
+
+
+# a util function that, in this case, gets us the IPFS CID
+def get_url_suffix(url: str) -> str:
+    return url.rsplit('/', 1)[-1]
+
+# Function to further clean the original JSON data by focusing only on the 'payload' property of 'msg'
+def clean_payload_data(original_data):
+    # Extract and parse the 'msg' fields, then extract the 'payload' property
+    cleaned_payload_data = [json.loads(item.get('msg', '{}')).get('payload', {}) for item in original_data]
+
+    # reduce each object to just the few properties that we need
+    reduced_data = []
+    for dapp in cleaned_payload_data:
+        cleaned_dapp = {
+            "name": dapp["name"],
+            "description": dapp["description"],
+            "url": dapp["url"],
+            "twitterHandle": dapp["twitterHandle"],
+            "blogLinks": dapp["blogLinks"],
+            "discord": dapp["socialLinks"]["discord"],
+            "facebook": dapp["socialLinks"]["facebook"],
+            "instagram": dapp["socialLinks"]["instagram"],
+            "telegram": dapp["socialLinks"]["telegram"]
+        }
+        reduced_data.append(cleaned_dapp)
+
+    return reduced_data
+
+
diff --git a/tools/index_widget.py b/tools/index_widget.py
index d2bc86e6..7375a93d 100644
--- a/tools/index_widget.py
+++ b/tools/index_widget.py
@@ -355,13 +355,16 @@ def fn(token_handler):
 @error_wrap
 def fetch_scraped_sites(query: str) -> Callable:
     def fn(token_handler):
-        scraped_sites_index = config.initialize(config.scraped_sites_index)
+        # below is the index previously used for scraped sites - do we still use it?
+        # if so, should we add a separate function for the new dapps_index?
+        # scraped_sites_index = config.initialize(config.scraped_sites_index)
+        dapps_index = config.initialize(config.dapps_index)
         tool = dict(
             type="tools.index_answer.IndexAnswerTool",
             _streaming=True,
             name="ScrapedSitesIndexAnswer",
            content_description="",  # not used
-            index=scraped_sites_index,
+            index=dapps_index,
             top_k=3,
             source_key="url",
         )
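A minimal usage sketch of the intended flow, assuming the repo root is importable and BROWSERLESS_API_KEY is set (the scraper invocation is an assumption; only the backfill command is spelled out in the patch):

    # scrape thedapplist.com curation page; writes dapp-list.json to the current directory
    from scrape.dapp_scraper import scrape_ipfs_links
    scrape_ipfs_links()

    # backfill() reads ./knowledge_base/dapp-list.json, so the scraped file needs to be
    # moved or copied into knowledge_base/ first; it then rebuilds the ThirdPartyDapps class
    from index.dapps import backfill
    backfill()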