This repository has been archived by the owner on May 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraping.py
158 lines (121 loc) · 4.94 KB
/
scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
import tiktoken
import time
import os
import csv
import openai
from dotenv import load_dotenv
import numpy as np
# Read environment variables (notably OPENAI_API_KEY) from a local .env file.
load_dotenv()
# models
# Model name used only to select the matching tiktoken encoding for token counts.
GPT_MODEL = "gpt-3.5-turbo"
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string.

    The encoding is looked up by its tiktoken name (e.g. "cl100k_base")
    rather than by model name; compare ``num_tokens``.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens *text* occupies under *model*'s tokenizer."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)
def split_text_chunks(text: str, max_tokens: int):
    """Split *text* into whitespace-delimited chunks of at most *max_tokens* tokens.

    The original implementation appended a word first and flushed only once the
    chunk's token count had already reached ``max_tokens``, so every emitted
    chunk could EXCEED the limit — fatal here, since the caller uses 8190
    against ada-002's hard 8191-token cap. It also re-tokenized the entire
    growing chunk after every word (quadratic). This version counts each word's
    tokens once and flushes *before* the budget would be crossed.

    NOTE(review): per-word token sums approximate the tokenization of the
    joined chunk; BPE merges across word boundaries can only reduce the real
    count, so the budget is respected — confirm against tiktoken if in doubt.

    Args:
        text: arbitrary text; split on whitespace.
        max_tokens: inclusive per-chunk token budget.

    Returns:
        list[str]: chunks joined with single spaces; empty list for empty text.
    """
    chunks = []
    current_words = []
    current_tokens = 0
    for word in text.split():
        # Count the word with its trailing separator, mirroring how the
        # original accumulated "word + ' '".
        word_tokens = num_tokens(word + " ")
        # Flush before the budget would be exceeded (never flush an empty
        # chunk — a single oversized word still gets emitted on its own).
        if current_words and current_tokens + word_tokens > max_tokens:
            chunks.append(" ".join(current_words))
            current_words = []
            current_tokens = 0
        current_words.append(word)
        current_tokens += word_tokens
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
# --- Script body: scrape each URL, embed its text, append rows to two CSVs. ---
# Fixes over the original: the Selenium driver is now released in a
# ``finally`` block (previously any exception leaked the Chrome process), and
# pages that yield no text are skipped (previously ``np.array([]).mean(axis=0)``
# produced a scalar NaN and ``len(embedding)`` raised TypeError).

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
assert OPENAI_API_KEY, "OPENAI_API_KEY environment variable is missing from .env"
openai.api_key = OPENAI_API_KEY

# Load the list of URLs, one per line.
with open('./additional.txt', 'r') as file:
    urls = [line.strip() for line in file.readlines()]

# Set up the driver options: headless, incognito to avoid cached state.
options = webdriver.ChromeOptions()
options.add_argument("--incognito")
options.add_argument("--disable-site-isolation-trials")
options.add_argument("--headless")

# Create a new instance of the Chrome driver.
driver = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))

# Per-chunk token budget (text-embedding-ada-002 accepts at most 8191 tokens).
max_tokens = 8190

embedding_file_name = 'visionos_docs_2023_07_10_embedding.csv'
text_file_name = 'visionos_docs_2023_07_10_text.csv'

# Write each CSV header once; data rows are appended inside the loop below.
with open(embedding_file_name, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['id', 'embedding'])
with open(text_file_name, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['id', 'text'])

try:
    for idx, url in enumerate(urls):
        # Navigate to the page and give its JS a fixed window to render.
        driver.get(url)
        time.sleep(5)

        html_source = driver.page_source

        # Hide nav bars and empty out footers so boilerplate is not scraped.
        driver.execute_script("var elems = document.getElementsByTagName('nav'); for (var i = 0; i < elems.length; i++) { elems[i].style.display = 'none'; }")
        driver.execute_script("var footers = document.getElementsByTagName('footer'); for (var i = 0; i < footers.length; i++) { while (footers[i].firstChild) { footers[i].removeChild(footers[i].firstChild); } }")

        # Concatenate the text of the content-bearing tags.
        text = ""
        for tag_name in ['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'code']:
            elements = driver.find_elements(By.TAG_NAME, tag_name)
            for element in elements:
                try:
                    text += element.text + " "
                except StaleElementReferenceException:
                    # The DOM re-rendered under us; skip this element.
                    continue

        text_chunks = split_text_chunks(text, max_tokens)
        print("chunks: ", len(text_chunks))
        print(text_chunks)

        if not text_chunks:
            # Nothing extracted (page failed to render or had no matching
            # tags). Averaging zero embeddings would crash below, so skip.
            print(f"No text extracted from {url}; skipping")
            continue

        # Embed each chunk separately, then average into one page vector.
        embeddings = []
        for chunk in text_chunks:
            response = openai.Embedding.create(
                input=chunk,
                model="text-embedding-ada-002"
            )
            embedding = response['data'][0]['embedding']
            embeddings.append(embedding)

        embedding = np.array(embeddings).mean(axis=0).tolist()
        print("embedding: ", len(embedding))
        print(f"Total tokens in the text: {num_tokens(text)}")

        # Append this page's embedding row.
        with open(embedding_file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([idx, ",".join(map(str, embedding))])
        # Append this page's raw text row.
        with open(text_file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([idx, text])
finally:
    # Always release the browser, even when a request or scrape fails.
    driver.quit()