-
Notifications
You must be signed in to change notification settings - Fork 3
/
opinion_contextualization.py
109 lines (94 loc) · 4.5 KB
/
opinion_contextualization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import argparse
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from efm import EFMReader
from model_reader import ModelReader
from util import substitute_word, to_one_hot
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, default='data/toy/test.csv',
help='Input file')
parser.add_argument('-p', '--preference_dir', type=str,
default='data/toy/efm',
help='Base model for getting preference score')
parser.add_argument('-m', '--model_path', type=str,
default='result/model.params',
help='ASC2V model file path')
parser.add_argument('-o', '--out', type=str,
default='contextualized.csv', help='Output path')
parser.add_argument('--debug', action='store_true',
help='Debug with smaller data')
return parser.parse_args()
class OpinionContextualizer():
def __init__(self, model_path, preference, strategy='', verbose=False):
self.model_reader = ModelReader(model_path, -1)
assert 'asc2v' in self.model_reader.params['model_type']
opinion2index = self.model_reader.opinion2index
self.word2index = self.model_reader.word2index
self.aspect2index = self.model_reader.aspect2index
self.aspect_opinions = self.model_reader.aspect_opinions
self.strategy = strategy
if 'aspect-opinion' not in self.strategy:
self.candidates = [self.word2index[w]
for w in opinion2index.keys()
if w in self.word2index and w != '<UNK>']
self.w_candidates = self.model_reader.w[self.candidates]
self.index2word = {v: k
for k, v in self.word2index.items()}
self.n_aspect = max(self.aspect2index.values()) + 1
self.preference = preference
self.verbose = verbose
if self.verbose:
print('Init OpinionContextualizer from %s' % model_path)
def get_ranked_opinions(self, user, item, sentence, aspect_position,
opinion_position, top_k=None):
sentence = sentence.split()
aspect_position = int(float(aspect_position))
opinion_position = int(float(opinion_position))
aspect = sentence[aspect_position]
if 'aspect-opinion' in self.strategy:
self.candidates = [self.word2index[w]
for w in self.aspect_opinions[aspect]
if w in self.word2index and w != '<UNK>']
self.w_candidates = self.model_reader.w[self.candidates]
aspect_index = self.aspect2index.get(
aspect, self.aspect2index['<UNK>'])
aspect1hot = to_one_hot(aspect_index, self.n_aspect)
score = self.preference.get_aspect_score(user, item, aspect)
sentence_v = [self.word2index.get(word, self.word2index['<UNK>'])
for word in sentence]
x = ([sentence_v], [opinion_position], [aspect1hot * score])
similarity = cosine_similarity(
self.model_reader.model.get_context_vector(x),
self.w_candidates).reshape(len(self.candidates))
ranked_ids = (-similarity).argsort()[0:top_k]
ranked_candidates = np.array(self.candidates).take(ranked_ids)
return [self.index2word[idx] for idx in ranked_candidates]
def contextualize(df, contextualizer, top_k=None, verbose=False):
if len(df) > 0:
df = df.copy()
df['top k opinions'] = df.apply(
lambda row: contextualizer.get_ranked_opinions(
row['reviewerID'],
row['asin'],
row['sentence'],
row['aspect_pos'],
row['opinion_pos'], top_k=top_k), axis=1)
df['predicted opinion'] = df['top k opinions'].apply(lambda x: x[0])
df['original sentence'] = df['sentence']
df['sentence'] = df.apply(
lambda row: substitute_word(
row['sentence'],
row['predicted opinion'],
row['opinion_pos']), axis=1)
return df
if __name__ == '__main__':
args = parse_arguments()
df = pd.read_csv(args.input)
if args.debug:
df = df[0:100]
preference = EFMReader(args.preference_dir)
contextualizer = OpinionContextualizer(args.model_path, preference)
df = contextualize(df, contextualizer)
df.to_csv(args.out, index=False)