generated from Marker-Inc-Korea/AutoRAG-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
56 lines (41 loc) · 1.71 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import ast
import os
from datetime import datetime
import pandas as pd
# Directory that contains this script, with symlinks resolved.
root_dir = os.path.split(os.path.realpath(__file__))[0]
# 'data' sub-folder located next to this script.
data_dir = os.path.join(root_dir, 'data')
def preprocess(sample_size=1000, random_state=42):
    """Build ``corpus.parquet`` from ``RAW_recipes.csv`` in ``data_dir``.

    Steps performed:

    1. Read the raw CSV and drop every row that has a missing value.
    2. Keep recipes with ``minutes >= 1``, ``10 <= n_steps < 20`` and a
       description longer than 20 characters.
    3. Drop the ``contributor_id`` and ``nutrition`` columns and parse the
       ``tags``, ``steps`` and ``ingredients`` columns from their string
       form into Python objects.
    4. Render each row to text with ``make_corpus`` (``contents``) and wrap
       the parsed ``submitted`` date in a ``metadata`` dict.
    5. Write a random sample with columns ``doc_id``, ``contents`` and
       ``metadata`` to ``corpus.parquet`` in ``data_dir``.

    Args:
        sample_size: Maximum number of rows to write. If fewer rows survive
            the filters, all of them are written instead of raising.
        random_state: Seed passed to ``DataFrame.sample`` so the sample is
            reproducible.
    """
    raw_df = pd.read_csv(os.path.join(data_dir, 'RAW_recipes.csv'))
    raw_df = raw_df.dropna()

    # One combined mask instead of four successive re-assignments.
    keep = (
        (raw_df['minutes'] >= 1)
        & (raw_df['n_steps'] >= 10)
        & (raw_df['n_steps'] < 20)
        & (raw_df['description'].str.len() > 20)
    )
    raw_df = raw_df[keep]
    raw_df = raw_df.drop(columns=['contributor_id', 'nutrition'])

    # These columns are stored in the CSV as Python-literal strings;
    # literal_eval parses them without executing arbitrary code.
    for column in ('tags', 'steps', 'ingredients'):
        raw_df[column] = raw_df[column].apply(ast.literal_eval)

    result_df = raw_df.copy()
    result_df['contents'] = result_df.apply(make_corpus, axis=1)
    result_df['metadata'] = result_df['submitted'].apply(
        lambda x: {'last_modified_datetime': datetime.strptime(x, '%Y-%m-%d')})
    result_df = result_df[['id', 'contents', 'metadata']]
    result_df = result_df.rename(columns={'id': 'doc_id'})
    result_df['doc_id'] = result_df['doc_id'].apply(str)

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so clamp the request to what is available.
    n_rows = min(sample_size, len(result_df))
    result_df.sample(n_rows, random_state=random_state).to_parquet(
        os.path.join(data_dir, 'corpus.parquet'), index=False)
def make_corpus(row):
    """Render one recipe row as a markdown-style text document.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by
            ``'name'``, ``'minutes'``, ``'ingredients'``, ``'steps'``,
            ``'description'`` and ``'tags'``. ``ingredients``, ``steps`` and
            ``tags`` must be iterables of strings.

    Returns:
        A string that starts and ends with a newline and contains a title
        line, the estimated time, and "Ingredients", "Steps", "Description"
        and "Tags" sections. Steps are listed one per line, numbered from 1.
    """
    # Number steps from 1: this text is meant to be read as a recipe, and a
    # list that begins with "0." is an off-by-one in the rendered output.
    step_str = '\n'.join(
        f"{i}. {val}" for i, val in enumerate(row['steps'], start=1))
    return f"""
# {row['name']} recipe
Estimated time: {row['minutes']} minutes
## Ingredients
{', '.join(row['ingredients'])}
## Steps
{step_str}
## Description
{row['description']}
## Tags
{', '.join(row['tags'])}
"""
# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    preprocess()