import re

__author__ = 'dnul'


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.preprocessing import normalize
import numpy as np


import glob

def representation(indices,dictionary,vector):
    for i in range(len(indices)):
        print(dictionary[indices[i]],vector[0,indices[i]])


def close_documents(distance_vector,corpus):
    closest = np.array(distance_vector).argsort()
    for value in closest[0:5]:
        print(corpus[value])


def max_n(row_data, row_indices, n):
        i = row_data.argsort()[-n:]
        # i = row_data.argpartition(-n)[-n:]
        top_values = row_data[i]
        top_indices = row_indices[i]  # do the sparse indices matter?
        return top_values, top_indices, i

def parse_double_utf8(txt):
    def parse(m):
        try:
            return m.group(0).encode('latin1').decode('utf8')
        except UnicodeDecodeError:
            return m.group(0)
    return re.sub(u'[\xc2-\xf4][\x80-\xbf]+', parse, txt)


corpus=[]
onlyfiles = glob.glob('./noticias/*.txt')
stopwords = parse_double_utf8(open('stopwords.txt','r').read()).splitlines()

for file in onlyfiles:
    content = open(file,'r').read()
    print(file)
    corpus=corpus+[content]

vectorizer = TfidfVectorizer(min_df=1,max_features=200,stop_words=stopwords)
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_
#print(dict(zip(vectorizer.get_feature_names(), idf)))
#print(len(vectorizer.get_feature_names()))

print(idf)

#print(X[0])
#print(cosine_similarity(X[0],X[1]))
#print(euclidean_distances(X[0],X[1]))
#print(euclidean_distances(X[1],X[1]))
#print(cosine_similarity(X[1],X[1]))
distances = pairwise_distances(X,metric='cosine')
for i,row in enumerate(distances):
    print('-------\n')
    indices = np.array(X[i])
    print(i,corpus[i])
    arr_ll=X[i].tolil()
    top_values,top_indices,wtf = max_n(np.array(arr_ll.data[0]),np.array(arr_ll.rows[0]),10)
    #print('top values',top_values,'top indices',top_indices)
    representation(top_indices,vectorizer.get_feature_names(),X[i])
    close_documents(row,corpus)
    print('-------\n')
    #print(X[i])
    #print(row)
    #row[i]=0
    #print(max(row))
    #print(min(row))

#closest = pairwise_distances_argmin(X[5],X[6,:],metric='cosine')