-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataParser.py
103 lines (86 loc) · 3.86 KB
/
dataParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json
import math
import os
import random
import sys
from pathlib import Path

import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): this does not seed the standard-library `random` module,
# which getPoliticalOrientation uses for its fallback branch.
np.random.seed(19680801)
def getPoliticalOrientation(newspaper):
    """Return the political-orientation label for a newspaper name.

    The label is picked by substring match against two fixed groups of
    outlet names; the right-center group is checked first. A name that
    matches neither group receives a randomly drawn label.

    Returns:
        Either "right-center" or "left-center".
    """
    right_center_outlets = (
        "The Australian",
        "The Times of India",
        "The Times",
    )
    left_center_outlets = (
        "Sydney Morning Herald",
        "The Age",
        "The Hindu",
        "Mail & Guardian",
        "The Washington Post",
        "New York Times",
    )

    if any(outlet in newspaper for outlet in right_center_outlets):
        return "right-center"
    if any(outlet in newspaper for outlet in left_center_outlets):
        return "left-center"

    # Unknown outlet (does not occur in this dataset): draw the label at
    # random, following the distribution in the data:
    # 9148 / 23474 = 0.3897077
    # which results in 9106 (right-center) and 23516 (left-center).
    return "right-center" if random.random() < 0.3897077 else "left-center"
def parsePoliticalOrientation(articles):
    """Label every article with the political orientation of its newspaper.

    Each article is updated in place: its 'political_orientation' entry is
    set to getPoliticalOrientation(article['newspaper']).

    Args:
        articles: iterable of articles; each must support item access with a
            'newspaper' key and item assignment.

    Returns:
        A one-dimensional NumPy array holding the same (now labelled)
        article objects in input order; an empty array for empty input.
    """
    labelled = []
    for article in articles:
        # The article object itself is modified (no copy is made), so a
        # caller holding the input sees the new key as well.
        article['political_orientation'] = getPoliticalOrientation(article['newspaper'])
        labelled.append(article)
    # Convert once at the end: np.append returns a fresh copy on every call,
    # so calling it inside the loop costs quadratic time.
    return np.append(np.array([]), labelled)
def read_data():
    """Load every JSON file found (recursively) under the 'data' directory.

    Returns:
        A one-dimensional NumPy array with one dict per file:
        {"metadata": {'cop_edition', 'collection_start', 'collection_end'},
         "articles": <result of parsePoliticalOrientation on the file's
                      'articles' entry>}.
        The array is empty when no JSON file is found.
    """
    editions = []
    # Path.glob yields matches in no guaranteed order; sorting makes the
    # result independent of how the file system lists the directory.
    for file in sorted(Path("data").glob("**/*.json")):
        # Decode explicitly as UTF-8 rather than the platform default encoding.
        with open(file, encoding="utf-8") as json_file:
            rawJson = json.load(json_file)
        editions.append({
            "metadata": {
                'cop_edition': rawJson['cop_edition'],
                'collection_start': rawJson['collection_start'],
                'collection_end': rawJson['collection_end'],
            },
            "articles": parsePoliticalOrientation(rawJson['articles']),
        })
    # Single conversion instead of one array copy per file.
    return np.append(np.array([]), editions)
def read_articles(train_percentage=0.8, test_percentage=0.1, dev_percentage=0.1, randomise=True):
    """Load all articles and split headlines and labels into train/dev/test.

    Articles whose headline is one of a fixed set of placeholder headlines
    (compared case-insensitively) are dropped before splitting.

    Args:
        train_percentage: fraction of the kept articles used for training.
        test_percentage: fraction of the kept articles used for testing.
        dev_percentage: fraction of the kept articles used for development.
        randomise: whether to shuffle before splitting; the seed is fixed.

    Returns:
        A list of six lists, in this order: train headlines, train labels,
        dev headlines, dev labels, test headlines, test labels.

    If the three fractions do not add up to 1, a message is printed and the
    process exits with status -1.
    """
    total = train_percentage + test_percentage + dev_percentage
    # Compare with a tolerance: valid splits such as 0.7 + 0.2 + 0.1 do not
    # sum to exactly 1.0 in floating point.
    if not math.isclose(total, 1.0, abs_tol=1e-9):
        print('Split does not add to 1')
        sys.exit(-1)

    raw_articles = [
        article
        for cop_edition in read_data()
        for article in cop_edition['articles']
    ]

    # remove articles with 'bad' headlines
    bad_headlines = frozenset((
        "no headline in original",
        "letters",
        "letters to the editor",
        "news summary",
        "letters & emails",
        "digest",
    ))
    kept = [
        article for article in raw_articles
        if article['headline'].lower() not in bad_headlines
    ]
    # One conversion instead of copying the array once per kept article.
    articles = np.append(np.array([]), kept)

    # split train / held-out; the held-out part is the complement of train
    train, holdout = train_test_split(
        articles,
        train_size=train_percentage,
        shuffle=randomise,
        random_state=19680801,
    )

    # split the held-out part (not the full set) into dev/test so that
    # neither can overlap the training data; sizes are expressed relative
    # to the held-out part, and dev is the complement of test
    test_share = test_percentage / (test_percentage + dev_percentage)
    dev, test = train_test_split(
        holdout,
        test_size=test_share,
        shuffle=randomise,
        random_state=19680801,
    )

    return [
        [article['headline'] for article in train],
        [article['political_orientation'] for article in train],
        [article['headline'] for article in dev],
        [article['political_orientation'] for article in dev],
        [article['headline'] for article in test],
        [article['political_orientation'] for article in test],
    ]
def read_single(path):
    """Load one JSON file and return its headlines and orientation labels.

    Args:
        path: location of a JSON file containing an 'articles' entry.

    Returns:
        A two-element list: [headlines, political orientations], with both
        inner lists in the order of the file's articles.
    """
    # Decode explicitly as UTF-8 rather than the platform default encoding.
    with open(path, encoding="utf-8") as json_file:
        rawJson = json.load(json_file)
    parsed = parsePoliticalOrientation(rawJson['articles'])
    return [
        [article['headline'] for article in parsed],
        [article['political_orientation'] for article in parsed],
    ]