-
Notifications
You must be signed in to change notification settings - Fork 18
/
tfidf.m
49 lines (39 loc) · 1.22 KB
/
tfidf.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
function fea = tfidf(fea,bNorm)
% TFIDF  Re-weight a document-term frequency matrix with tf-idf.
%
%   fea = tfidf(fea)
%   fea = tfidf(fea,bNorm)
%
% fea is a document-term frequency matrix (one row per document, one
% column per term). This function returns the tf-idf weighted
% document-term matrix, where every nonzero entry tf is replaced by
%
%       [1 + log(tf)] * log[N / df]
%
% with N the number of documents (rows) and df the number of documents
% in which the term (column) occurs. Nonzero entries are assumed to be
% positive counts.
%
% If bNorm == 1, each document vector will be further normalized to
% have unit norm (default). If bNorm == 0, the un-normalized tf-idf
% matrix is returned.
%
% The result is a sparse matrix with the same size as the input.
%
% version 2.0 --Jan/2012
% version 1.0 --Oct/2003
%
% Written by Deng Cai (dengcai AT gmail.com)
%

if nargin < 2
    bNorm = 1;
end

[nSmp,mFea] = size(fea);

% Row index, column index and value of every nonzero entry. The (:) forces
% column vectors, because find returns row vectors for a 1-by-n input.
[idx,jdx,vv] = find(fea);
idx = idx(:);
jdx = jdx(:);
vv  = double(vv(:));   % log is only defined for floating-point input

% Document frequency: number of documents containing each term. The
% explicit nSmp-by-mFea size keeps trailing all-zero rows/columns, which
% sparse(i,j,v) would otherwise drop.
df = full(sum(sparse(idx,jdx,1,nSmp,mFea),1));
df(df==0) = 1;         % unused terms: avoid dividing by zero
idf = log(nSmp./df);
idf = idf(:);

% Scale each nonzero tf weight by the idf of its own column. This gives
% the same products as multiplying every column by its idf, without
% building a dense temporary.
fea = sparse(idx,jdx,(log(vv)+1).*idf(jdx),nSmp,mFea);

% Optionally normalize every document vector to unit norm.
% NormalizeFea is an external helper (not defined in this file); it is
% given the term-by-document matrix, whose columns are the tf-idf
% vectors, with second argument 0.
if bNorm
    fea = NormalizeFea(fea',0)';
end
% fea is the final document-term matrix.