Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ConceptNetNumberbatch word embeddings support #14

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
julia 0.7
DataDeps 0.5.1
AutoHashEquals
HDF5
6 changes: 4 additions & 2 deletions src/Embeddings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@ module Embeddings
using Statistics: norm
using DataDeps
using AutoHashEquals
using HDF5

export load_embeddings, language_files
export Word2Vec, GloVe, FastText_Text
export Word2Vec, GloVe, FastText_Text, ConceptNet

abstract type EmbeddingSystem{LANG} end
include("fasttext.jl")
include("glove.jl")
include("word2vec.jl")
include("conceptnet.jl")

# Module init hook: register the DataDeps for every supported embedding
# system so their data files can be fetched on first use.
function __init__()
    foreach(init, (Word2Vec, GloVe, FastText, ConceptNet))
end
Expand Down
127 changes: 127 additions & 0 deletions src/conceptnet.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
abstract type ConceptNet{LANG} <: EmbeddingSystem{LANG} end


"""
    init(::Type{ConceptNet})

Register `DataDep`s for the ConceptNet Numberbatch releases — multilingual
text, English-only text, and the compressed HDF5 "mini" file — and record
each release's embedding file under its language key (`:multi`, `:en`,
`:compressed`) via `language_files`.
"""
function init(::Type{ConceptNet})
    for (source, link, file, hashstring, post_fetch_method, lang) in [
            ("Multilingual",
             "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz",
             "numberbatch-17.06.txt",
             "375ac41e8f9caab172fc169e54e8658be23ece6cd92188bf700a99330cd1a81b",
             DataDeps.unpack,
             :multi),
            ("English",
             "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz",
             "numberbatch-en-17.06.txt",
             "72faf0a487c61b9a6a8c9ff0a1440d2f4936bb19102bddf27a833c2567620f2d",
             DataDeps.unpack,
             :en),
            ("Compressed",
             "http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5",
             "mini.h5",
             "87f8b48fd01088b72013dfbcc3160f0d0e16942797959e0824b9b69f31a7b222",
             identity,  # HDF5 file is used as-is; no unpacking needed
             :compressed)
        ]
        register(DataDep("ConceptNet_$source",
            """
            Pretrained ConceptNet Numberbatch word embeddings
            Website: https://github.com/commonsense/conceptnet-numberbatch
            License: CC-SA 4.0
            Author: Luminoso Technologies Inc.
            Year: 2017

            ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known as word embeddings) that can be used directly as a representation of word meanings or as a starting point for further machine learning.

            Paper:
                Robert Speer, Joshua Chin, and Catherine Havasi (2017). "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge." In proceedings of AAAI 2017.
            """,
            link,
            hashstring;
            post_fetch_method=post_fetch_method))

        push!(language_files(ConceptNet{lang}), "ConceptNet_$(source)/$(file)")
    end
end


# Dispatch loading to the HDF5 or plain-text reader based on the file
# extension (".h5"/".hdf5" → HDF5, anything else → text).
#
# Returns `(embeddings, indexed_words)` as produced by the selected loader.
# (Fixes: removed the unused `index` local; replaced the eager, allocating
# `ifelse`/broadcast dispatch with a plain conditional.)
function _load_embeddings(::Type{<:ConceptNet}, embedding_file, max_vocab_size, keep_words)
    is_hdf5 = endswith(embedding_file, ".h5") || endswith(embedding_file, ".hdf5")
    loader = is_hdf5 ? _load_hdf5_embeddings : _load_txt_embeddings
    return loader(embedding_file, max_vocab_size, keep_words)
end


# Number of embeddings to retrieve: the requested maximum, clamped to the
# vocabulary actually available (a negative real size is treated as zero).
function _get_vocab_size(real_vocab_size, max_vocab_size)
    n_available = max(0, real_vocab_size)
    return min(n_available, max_vocab_size)
end


# Load ConceptNet Numberbatch embeddings from a plain-text file.
#
# The first line holds "<vocab_size> <vector_size>"; every following line is
# a word followed by its space-separated vector components. At most
# `max_vocab_size` words are stored; when `keep_words` is non-empty, only
# words contained in it are kept. Returns `(embeddings, indexed_words)`
# where `embeddings` is a `vector_size × n` `Float64` matrix and
# `indexed_words` the matching vocabulary, in file order.
function _load_txt_embeddings(file::AbstractString, max_vocab_size, keep_words)
    local LL, indexed_words
    open(file, "r") do fid
        vocab_size, vector_size = map(x -> parse(Int, x), split(readline(fid)))
        max_stored_vocab_size = _get_vocab_size(vocab_size, max_vocab_size)

        indexed_words = Vector{String}(undef, max_stored_vocab_size)
        LL = Array{Float64}(undef, vector_size, max_stored_vocab_size)

        cnt = 0
        for row in eachline(fid)
            # Stop before storing more words than were allocated.
            cnt == max_stored_vocab_size && break
            fields = split(row, " ")
            word = string(popfirst!(fields))
            if isempty(keep_words) || word in keep_words
                cnt += 1
                # Store at the *kept* position, not the file position: when
                # `keep_words` filters rows, the file index can exceed the
                # allocated column count (BoundsError in the old code, which
                # wrote `LL[:, index]` with the enumeration index).
                LL[:, cnt] = parse.(Float64, fields)
                indexed_words[cnt] = word
            end
        end
        LL = LL[:, 1:cnt]
        indexed_words = indexed_words[1:cnt]
    end
    return LL, indexed_words
end


# Load the ConceptNet Numberbatch embeddings from the compressed HDF5
# ("mini") release: words come from the "axis1" dataset and vectors from
# "block0_values", both inside the "mat" group.
function _load_hdf5_embeddings(file::AbstractString, max_vocab_size, keep_words)
    data = h5open(read, file)["mat"]
    words = data["axis1"]
    vectors = data["block0_values"]
    limit = _get_vocab_size(length(words), max_vocab_size)

    selected = Int[]
    for (pos, word) in enumerate(words)
        if isempty(keep_words) || word in keep_words
            push!(selected, pos)
            # Same stop rule as the original (`cnt > limit - 1`).
            length(selected) >= limit && break
        end
    end
    return vectors[:, selected], words[selected]
end
50 changes: 50 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,53 @@ end
end

end

# Integration tests: each inner testset downloads the corresponding real
# ConceptNet artifact via DataDeps (`@testset_nokeep_data` presumably
# removes the downloaded data afterwards — defined elsewhere in this file's
# test suite) and checks embedding shape, vocabulary truncation via
# `max_vocab_size`, and word filtering via `keep_words`.
@testset "ConceptNet" begin
@testset_nokeep_data "Multilingual" begin
embs_full = load_embeddings(ConceptNet{:multi})

# Vectors are 300-dimensional; one column per vocabulary word.
@test size(embs_full.embeddings) == (300, length(embs_full.vocab))

embs_mini = load_embeddings(ConceptNet{:multi}; max_vocab_size=100)
@test length(embs_mini.vocab)==100

# Truncation must keep the first 100 words/columns in file order.
@test embs_mini.embeddings == embs_full.embeddings[:, 1:100]
@test embs_mini.vocab == embs_full.vocab[1:100]
end

@testset_nokeep_data "English" begin
embs_full = load_embeddings(ConceptNet{:en})

@test size(embs_full.embeddings) == (300, length(embs_full.vocab))

embs_mini = load_embeddings(ConceptNet{:en}; max_vocab_size=100)
@test length(embs_mini.vocab)==100

@test embs_mini.embeddings == embs_full.embeddings[:, 1:100]
@test embs_mini.vocab == embs_full.vocab[1:100]

# keep_words: only the requested words are loaded (order not guaranteed,
# hence the Set comparison).
embs_specific = load_embeddings(ConceptNet{:en};
keep_words=Set(["red", "green", "blue"]))

@test size(embs_specific.embeddings) == (300, 3)
@test Set(embs_specific.vocab) == Set(["red", "green", "blue"])
end

@testset_nokeep_data "Compressed" begin
embs_full = load_embeddings(ConceptNet{:compressed})

@test size(embs_full.embeddings) == (300, length(embs_full.vocab))

embs_mini = load_embeddings(ConceptNet{:compressed}; max_vocab_size=100)
@test length(embs_mini.vocab)==100

@test embs_mini.embeddings == embs_full.embeddings[:, 1:100]
@test embs_mini.vocab == embs_full.vocab[1:100]

# The compressed release uses ConceptNet URI-style keys ("/c/en/<word>").
embs_specific = load_embeddings(ConceptNet{:compressed};
keep_words=Set(["/c/en/red", "/c/en/green", "/c/en/blue"]))

@test size(embs_specific.embeddings) == (300, 3)
@test Set(embs_specific.vocab) == Set(["/c/en/red", "/c/en/green", "/c/en/blue"])
end
end