# app.py — Flask app serving article topic clusters (62 lines, 1.35 KB)
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from hot_topics import Clusters
from hot_topics.helpers import STOP_WORDS, clusterTokenizer
from flask import Flask
# WSGI application instance; the route handlers below are registered on it.
app = Flask(__name__)
def load_data():
    """Load articles.csv and derive date parts plus a combined text column.

    Returns:
        pd.DataFrame: the articles with added ``datetime`` (tz-aware UTC),
        ``year``, ``month``, and ``content`` columns.
    """
    df = pd.read_csv("articles.csv")
    # Parse timestamps as timezone-aware UTC.
    df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
    # BUG FIX: the parsed column is 'datetime', not 'time' — the original
    # df['time'] raised KeyError before any request was served.
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    # Single text field fed to the TF-IDF vectorizer downstream.
    df['content'] = df['title'] + ' ' + df['ingress'] + ' ' + df['body']
    return df
# Build the feature matrix once at import time so requests stay cheap.
df = load_data()

# TF-IDF features over unigrams through 4-grams of the article text.
vectorizer = TfidfVectorizer(
    use_idf=True,
    tokenizer=clusterTokenizer,
    stop_words=STOP_WORDS,
    max_features=10000,
    lowercase=True,
    ngram_range=(1, 4),
)
X = vectorizer.fit_transform(df['content'].values)

# LSA: project the sparse TF-IDF matrix down to 300 dimensions, then
# L2-normalize each row (in place, to avoid an extra copy).
svd = TruncatedSVD(n_components=300)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_lsa = lsa.fit_transform(X)

# Group the reduced document vectors into topic clusters.
topics = Clusters(df, X_lsa)
@app.route('/')
def index():
    """Landing page: render the cluster scatter plot."""
    plot = topics.scatter_plot()
    return plot
@app.route('/result')
def result():
    """Return the clustering result serialized as JSON."""
    payload = topics.to_JSON()
    return payload
# Run the Flask development server when executed directly
# (not when imported by a WSGI host).
if __name__ == "__main__":
    app.run()