-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhoneymaker.py
executable file
·161 lines (135 loc) · 5.37 KB
/
honeymaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from faker import Faker
import json
import random
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import argparse
def generate_fake_data(
    num_contact_records: int = 50, num_paper_dirs: int = 10, language: str = None
):
    """
    Generate a tree of fake ("honey") contacts, research papers and data files.

    Everything is written under ``/home/student/generated/<language>/``:

    * ``<contacts>.json`` holding ``num_contact_records`` contact records;
    * a papers directory with numbered subfolders ``1..num_paper_dirs``, each
      filled with 1-9 PDFs of filler text and 1-9 JSON-formatted ``.txt``
      research-data files.

    Directory, file and label names are localized for ``"russian"``,
    ``"spanish"`` and ``"chinese"``. Any other ``language`` value (including
    ``None``) uses Faker's default locale together with English names. The
    ``language`` value is interpolated into the output path exactly as given.

    Args:
        num_contact_records: Number of contact records to write.
        num_paper_dirs: Number of numbered paper subfolders to populate.
        language: Language selector, see above.
    """
    # language -> (Faker locale, papers dir, contacts file stem,
    #              data file stem, name label, courses label)
    localized = {
        "russian": ("ru_RU", "документы", "контакты", "данные", "имя", "Курсы"),
        "spanish": ("es_ES", "documentos", "contactos", "datos", "Nombre", "Cursos"),
        "chinese": ("zh_CN", "文件", "联系人", "数据", "姓名", "培训班"),
    }
    fallback = (None, "papers", "contacts", "research_data", "Name", "Courses")
    (
        locale,
        papers_dir,
        contacts_file,
        data_filename,
        name_label,
        courses_label,
    ) = localized.get(language, fallback)
    faker_obj = Faker(locale) if locale else Faker()

    save_dir = f"/home/student/generated/{language}/"
    papers_root = save_dir + papers_dir
    create_folder_structure(papers_root)
    generate_contacts(
        faker_obj,
        save_dir + contacts_file + ".json",
        num_contact_records,
        name_label,
        courses_label,
    )

    # Generate fake papers and data. The upper bound is inclusive so that
    # exactly ``num_paper_dirs`` folders are populated.
    for folder_num in range(1, num_paper_dirs + 1):
        papers_folder = os.path.join(papers_root, f"{folder_num}")
        # Do not rely on create_folder_structure having made this particular
        # subfolder; it only guarantees a fixed set of them.
        os.makedirs(papers_folder, exist_ok=True)

        for paper_num in range(1, random.randint(2, 10)):
            pdf_file = os.path.join(papers_folder, f"{papers_dir}_{paper_num}.pdf")
            _write_fake_pdf(faker_obj, pdf_file)

        for data_num in range(1, random.randint(2, 10)):
            data_file = os.path.join(papers_folder, f"{data_filename}_{data_num}.txt")
            _write_research_data(faker_obj, data_file)

    print("Fake papers and data generated and saved as PDFs.")


def _write_fake_pdf(faker_obj, pdf_file):
    """Write 150 Faker paragraphs to ``pdf_file``, wrapped and paginated."""
    # Imported here so the helper is self-contained; reportlab is already a
    # dependency of this module.
    from reportlab.lib.utils import simpleSplit

    font_name = "Helvetica"
    font_size = 12
    left_margin = 20
    top_y = 750
    bottom_y = 40
    line_height = 14
    page_width = letter[0]
    max_line_width = page_width - 2 * left_margin

    # NOTE(review): Helvetica is a built-in Type 1 font; confirm it can render
    # the Cyrillic / Chinese text produced for the localized variants, and
    # register a TrueType font here if it cannot.
    pdf = canvas.Canvas(pdf_file, pagesize=letter)
    pdf.setFont(font_name, font_size)
    y = top_y
    for paragraph in faker_obj.paragraphs(nb=150):
        # Break each paragraph to the printable width instead of letting it
        # run off the right edge of the page.
        for line in simpleSplit(paragraph, font_name, font_size, max_line_width):
            if y < bottom_y:
                # Start a new page rather than drawing below the visible area.
                # showPage() resets the canvas state, so the font is set again.
                pdf.showPage()
                pdf.setFont(font_name, font_size)
                y = top_y
            pdf.drawString(left_margin, y, line)
            y -= line_height
    pdf.save()


def _write_research_data(faker_obj, data_file):
    """Write one fake research-results record to ``data_file`` as JSON text."""
    research_results = {
        "Title": faker_obj.sentence(),
        "Author": faker_obj.name(),
        "Abstract": faker_obj.text(max_nb_chars=200),
        "Data": faker_obj.text(max_nb_chars=1000),
    }
    # UTF-8 with ensure_ascii=False keeps localized text human-readable
    # instead of emitting \uXXXX escapes in a platform-dependent encoding.
    with open(data_file, "w", encoding="utf-8") as data:
        json.dump(research_results, data, indent=4, ensure_ascii=False)
def create_folder_structure(papers_dir, num_folders: int = 9):
    """
    Create ``papers_dir`` and its numbered subfolders ``1..num_folders``.

    Missing parent directories are created as needed and directories that
    already exist are left untouched, so the call is safe to repeat.

    Args:
        papers_dir: Path of the main papers directory.
        num_folders: How many numbered subfolders to create. Defaults to 9,
            the number this function has always created.
    """
    # exist_ok=True replaces the exists()-then-makedirs() check, which could
    # fail if the directory appeared between the two calls.
    os.makedirs(papers_dir, exist_ok=True)

    # Create numbered subfolders
    for folder_num in range(1, num_folders + 1):
        os.makedirs(os.path.join(papers_dir, f"{folder_num}"), exist_ok=True)
def generate_contacts(
    faker_obj, contacts_output_file, num_contact_records, name_label, courses_label
):
    """
    Write ``num_contact_records`` fake contact records to a JSON file.

    Each record has a random 8-letter uppercase ``"UID"``, a name and an
    ``"Email"`` taken from ``faker_obj``, and a list of 3-6 course codes. A
    course code is a four-letter department followed by a three-digit number
    whose first digit is 6, 7 or 8.

    Args:
        faker_obj: Object providing ``name()`` and ``email()``.
        contacts_output_file: Path of the JSON file to write. Its parent
            directory is created if it does not exist.
        num_contact_records: Number of records to generate.
        name_label: Dictionary key under which the name is stored.
        courses_label: Dictionary key under which the course list is stored.
    """
    # NOTE(review): "HACS" is listed twice, so it is drawn more often than the
    # other departments and can appear twice in one record; confirm intended.
    all_courses = ["HACS", "CMSC", "MATH", "BMGT", "AMST", "ENES", "BIOE", "STAT",
                   "BUFN", "CHEM", "PHYS", "HACS", "INST", "MSML", "HESI", "RUSS"]

    contacts = []
    for _ in range(num_contact_records):
        uid = "".join(random.choices("ABCDEFGHIJKLMNOPQRSTUVWXYZ", k=8))
        name = faker_obj.name()
        email = faker_obj.email()

        # Choose 3-6 random departments and append a course number to each
        departments = random.sample(all_courses, random.randint(3, 6))
        courses = [
            f"{dept}{random.randint(6, 8)}{random.randint(0, 9)}{random.randint(0, 9)}"
            for dept in departments
        ]

        contacts.append(
            {"UID": uid, name_label: name, "Email": email, courses_label: courses}
        )

    parent_dir = os.path.dirname(contacts_output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the generated data to a JSON file. UTF-8 with ensure_ascii=False
    # keeps localized names and labels readable instead of \uXXXX-escaped.
    with open(contacts_output_file, "w", encoding="utf-8") as contacts_file:
        json.dump(contacts, contacts_file, indent=4, ensure_ascii=False)

    print("Contacts data saved to", contacts_output_file)
def generate_all_data(contacts, papers):
    """
    Run ``generate_fake_data`` once per language.

    The languages are processed in the order english, russian, chinese,
    spanish; ``contacts`` and ``papers`` are forwarded unchanged as the first
    two positional arguments of every call.
    """
    every_language = ("english", "russian", "chinese", "spanish")
    for lang in every_language:
        generate_fake_data(contacts, papers, lang)
def main(argv=None):
    """
    Command-line entry point.

    Without ``--language`` honey is generated for every language via
    ``generate_all_data``. With ``--language`` only that language is
    generated via ``generate_fake_data``; previously the option was parsed
    but had no effect.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Generate contacts & research data.")
    parser.add_argument(
        "--contacts", type=int, default=50, help="Number of contact records to generate"
    )
    parser.add_argument(
        "--papers", type=int, default=10, help="Number of paper directories to create"
    )
    parser.add_argument(
        "--language",
        type=str,
        choices=["english", "spanish", "chinese", "russian"],
        default=None,
        help="Language of honey (default: generate every language)",
    )
    args = parser.parse_args(argv)

    if args.language is None:
        generate_all_data(args.contacts, args.papers)
    else:
        generate_fake_data(args.contacts, args.papers, args.language)


if __name__ == "__main__":
    main()