# contact_export.py
#
# Scrapes e-mail addresses and social-media links from the project websites
# listed in resources/mixed_data.csv and appends them to
# resources/contact_info.csv.
import csv
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
# Anything in the raw page markup that is shaped like an e-mail address.
_EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

# Asset names such as "logo@2x.png" also match _EMAIL_RE; candidates that
# contain a WxH size or end in an image extension are rejected. Matching is
# case-insensitive so "LOGO@2X.PNG" is rejected as well.
_IMAGE_LIKE_RE = re.compile(
    r'\d+x\d+|\.(png|jpg|jpeg|gif|bmp|svg|webp)$', re.IGNORECASE
)

# Domains whose links are reported in the "Socials" field.
_SOCIAL_DOMAINS = (
    'facebook.com',
    'twitter.com',
    'instagram.com',
    'linkedin.com',
    'youtube.com',
)
_SOCIAL_XPATH = "//a[" + " or ".join(
    f"contains(@href, '{domain}')" for domain in _SOCIAL_DOMAINS
) + "]"


def _build_chrome_options():
    """Return the headless Chrome options used for every scrape."""
    chrome_options = Options()
    for argument in (
        "--headless",
        "--start-maximized",
        "--disable-notifications",
        "--disable-popup-blocking",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-webgl",
        "--disable-extensions",
        "--log-level=3",
        "user-agent=Mozilla/5.0",
    ):
        chrome_options.add_argument(argument)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    return chrome_options


def _extract_emails(page_source):
    """Return the e-mail addresses in *page_source* joined by '; ', or None."""
    candidates = set(_EMAIL_RE.findall(page_source))
    # Sorted so the same page always yields the same string; plain set
    # iteration order differs between interpreter runs.
    emails = sorted(c for c in candidates if not _IMAGE_LIKE_RE.search(c))
    return '; '.join(emails) if emails else None


def _extract_social_links(driver):
    """Return the page's social-media hrefs joined by '; ' ('' when none)."""
    anchors = driver.find_elements(By.XPATH, _SOCIAL_XPATH)
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    # dict.fromkeys drops repeated links (e.g. header + footer icons) while
    # keeping first-seen order; falsy hrefs are skipped so join cannot fail.
    unique_links = dict.fromkeys(href for href in hrefs if href)
    return '; '.join(unique_links)


def scrape_contact_info(index, website, project_name, *, page_timeout=10,
                        settle_seconds=5):
    """Scrape e-mail addresses and social-media links from one website.

    The page is opened in a fresh headless Chrome session, scrolled to the
    bottom, given ``settle_seconds`` to finish loading, and then searched.
    This function never raises for scraping problems: any exception,
    including a failure to start the browser, is reported through the
    ``"Error"`` key of the returned dict.

    Args:
        index: Not used by this function; accepted so existing callers that
            pass the row index keep working.
        website: URL to load.
        project_name: Copied verbatim into the result's "Project Name".
        page_timeout: Seconds to wait for the ``<body>`` element to appear.
        settle_seconds: Seconds to sleep after scrolling to the bottom.

    Returns:
        A dict with "Project Name", "Email" (a '; '-joined string, or None
        when nothing was found) and "Socials" (a '; '-joined string, possibly
        empty). On failure "Email" is None, "Socials" is '' and an extra
        "Error" key holds ``str(exception)``.
    """
    driver = None
    try:
        # Browser start-up happens inside the try so that a broken
        # Chrome/driver install yields an error result instead of an
        # exception escaping from the worker thread.
        driver = webdriver.Chrome(options=_build_chrome_options())
        driver.get(website)
        WebDriverWait(driver, page_timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Scroll to the bottom, then pause so the page can finish loading.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(settle_seconds)

        return {
            "Project Name": project_name,
            "Email": _extract_emails(driver.page_source),
            "Socials": _extract_social_links(driver),
        }
    except Exception as exc:
        return {
            "Project Name": project_name,
            "Email": None,
            "Socials": '',
            "Error": str(exc),
        }
    finally:
        # driver stays None when the browser never started.
        if driver is not None:
            driver.quit()
# Driver script: read the project list, scrape every website on a small
# thread pool, and append one CSV row per project as results arrive.
df = pd.read_csv('resources/mixed_data.csv')

# Ensure output directory exists
os.makedirs('resources', exist_ok=True)

# Filter out rows with missing Website or Name
valid_rows = df.dropna(subset=['Website', 'Name'])

# Output CSV setup
output_csv = 'resources/contact_info.csv'
output_columns = ['Project Name', 'Email', 'Socials']

# The file is opened in append mode so earlier runs are preserved; the header
# is written only when the file is missing or still empty.
needs_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0

with open(output_csv, mode='a', newline='', encoding='utf-8') as out_file:
    # csv.writer quotes fields containing commas, quotes or newlines, which
    # project names and link lists can contain, and writes None as an empty
    # field rather than the text "None".
    writer = csv.writer(out_file)
    if needs_header:
        writer.writerow(output_columns)
        out_file.flush()

    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_project = {
            executor.submit(
                scrape_contact_info, row.Index, row.Website, row.Name
            ): (row.Name, row.Website)
            for row in valid_rows[['Website', 'Name']].itertuples()
        }

        with tqdm(total=len(future_to_project), desc="Scraping contact info",
                  bar_format="{l_bar}\033[95m{bar}\033[0m{r_bar}") as pbar:
            for future in as_completed(future_to_project):
                name, website = future_to_project[future]
                try:
                    result = future.result()
                except Exception as exc:
                    # One failed worker must not abort the remaining
                    # projects; record an empty row for it instead.
                    result = {
                        "Project Name": name,
                        "Email": None,
                        "Socials": '',
                        "Error": str(exc),
                    }

                error = result.get("Error")
                if error:
                    # The CSV keeps its three columns, so failures are shown
                    # on the console. tqdm.write avoids garbling the bar;
                    # only the first line of the message is printed.
                    tqdm.write(f"Failed to scrape {website}: {error.splitlines()[0]}")

                # Write the result row-by-row and flush so that completed
                # work is on disk even if the run is interrupted.
                writer.writerow(
                    [result['Project Name'], result['Email'], result['Socials']]
                )
                out_file.flush()
                pbar.update(1)