diff --git a/REQUIRE b/REQUIRE
index 34cb1fc..4c638da 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,3 +1,4 @@
 julia 0.7
 DataDeps 0.5.1
 AutoHashEquals
+HDF5
diff --git a/src/Embeddings.jl b/src/Embeddings.jl
index eca5846..dc8cf5f 100644
--- a/src/Embeddings.jl
+++ b/src/Embeddings.jl
@@ -3,17 +3,19 @@ module Embeddings
 using Statistics: norm
 using DataDeps
 using AutoHashEquals
+using HDF5
 
 export load_embeddings, language_files
-export Word2Vec, GloVe, FastText_Text
+export Word2Vec, GloVe, FastText_Text, ConceptNet
 
 abstract type EmbeddingSystem{LANG} end
 include("fasttext.jl")
 include("glove.jl")
 include("word2vec.jl")
+include("conceptnet.jl")
 
 function __init__()
-    for T in [Word2Vec, GloVe, FastText]
+    for T in [Word2Vec, GloVe, FastText, ConceptNet]
         init(T)
     end
 end
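Note on the pattern used in the new file below: init(ConceptNet) registers one DataDep per distribution (multilingual text, English-only text, compressed HDF5) and pushes the matching "ConceptNet_<source>/<file>" path onto language_files(ConceptNet{lang}), which load_embeddings later resolves through DataDeps. A minimal sketch of that mechanism outside this patch, using only the documented DataDeps API (the registration name and path are copied from the diff for illustration):

    using DataDeps

    # Register a downloadable artifact: name, message, URL, SHA2-256 checksum.
    # post_fetch_method=DataDeps.unpack decompresses the .gz after download.
    register(DataDep("ConceptNet_English",
        "Pretrained ConceptNet Numberbatch word embeddings",
        "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz",
        "72faf0a487c61b9a6a8c9ff0a1440d2f4936bb19102bddf27a833c2567620f2d";
        post_fetch_method=DataDeps.unpack))

    # Resolving a registered path triggers the download on first use:
    path = datadep"ConceptNet_English/numberbatch-en-17.06.txt"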
+ """, + link, + hashstring; + post_fetch_method=post_fetch_method + )) + + push!(language_files(ConceptNet{lang}), "ConceptNet_$(source)/$(file)") + end +end + + +function _load_embeddings(::Type{<:ConceptNet}, embedding_file, max_vocab_size, keep_words) + local LL, indexed_words, index + _loader = ifelse(any(endswith.(embedding_file, [".h5", ".hdf5"])), + _load_hdf5_embeddings, + _load_txt_embeddings) + LL, indexed_words = _loader(embedding_file, max_vocab_size, keep_words) + return LL, indexed_words +end + + +# Function that calculates how many embeddings to retreive +function _get_vocab_size(real_vocab_size, max_vocab_size) + real_vocab_size = max(0, real_vocab_size) + max_vocab_size = min(real_vocab_size, max_vocab_size) + return max_vocab_size +end + + +function _load_txt_embeddings(file::AbstractString, max_vocab_size, keep_words) + local LL, indexed_words + open(file, "r") do fid + vocab_size, vector_size = map(x->parse(Int,x), split(readline(fid))) + max_stored_vocab_size = _get_vocab_size(vocab_size, max_vocab_size) + + indexed_words = Vector{String}(undef, max_stored_vocab_size) + LL = Array{Float64}(undef, vector_size, max_stored_vocab_size) + + function _parseline(buf; word_only=false) + bufvec = split(buf, " ") + word = string(popfirst!(bufvec)) + if word_only + return word, Float64[] + else + embedding = parse.(Float64, bufvec) + return word, embedding + end + end + + cnt = 0 + indices = Int[] + for (index, row) in enumerate(eachline(fid)) + word, _ = _parseline(row, word_only=true) + if length(keep_words)==0 || word in keep_words + _, embedding = _parseline(row) + LL[:, index] = embedding + indexed_words[index] = word + push!(indices, index) + cnt+=1 + if cnt > max_stored_vocab_size-1 + break + end + end + end + LL = LL[:, indices] + indexed_words = indexed_words[indices] + end + return LL, indexed_words +end + + +# Loads the ConceptNetNumberbatch from a HDF5 file +function _load_hdf5_embeddings(file::AbstractString, max_vocab_size, keep_words) + payload = h5open(read, file)["mat"] + words = payload["axis1"] + vectors = payload["block0_values"] + max_stored_vocab_size = _get_vocab_size(length(words), max_vocab_size) + + indices = Int[] + cnt = 0 + for (index, word) in enumerate(words) + if length(keep_words)==0 || word in keep_words + push!(indices, index) + cnt+=1 + if cnt > max_stored_vocab_size-1 + break + end + end + end + return vectors[:, indices], words[indices] +end diff --git a/test/runtests.jl b/test/runtests.jl index 8ad221e..b7fa198 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -147,3 +147,53 @@ end end end + +@testset "ConceptNet" begin + @testset_nokeep_data "Multilingual" begin + embs_full = load_embeddings(ConceptNet{:multi}) + + @test size(embs_full.embeddings) == (300, length(embs_full.vocab)) + + embs_mini = load_embeddings(ConceptNet{:multi}; max_vocab_size=100) + @test length(embs_mini.vocab)==100 + + @test embs_mini.embeddings == embs_full.embeddings[:, 1:100] + @test embs_mini.vocab == embs_full.vocab[1:100] + end + + @testset_nokeep_data "English" begin + embs_full = load_embeddings(ConceptNet{:en}) + + @test size(embs_full.embeddings) == (300, length(embs_full.vocab)) + + embs_mini = load_embeddings(ConceptNet{:en}; max_vocab_size=100) + @test length(embs_mini.vocab)==100 + + @test embs_mini.embeddings == embs_full.embeddings[:, 1:100] + @test embs_mini.vocab == embs_full.vocab[1:100] + + embs_specific = load_embeddings(ConceptNet{:en}; + keep_words=Set(["red", "green", "blue"])) + + @test size(embs_specific.embeddings) == (300, 3) + @test 
diff --git a/test/runtests.jl b/test/runtests.jl
index 8ad221e..b7fa198 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -147,3 +147,53 @@ end
     end
 end
 
+
+@testset "ConceptNet" begin
+    @testset_nokeep_data "Multilingual" begin
+        embs_full = load_embeddings(ConceptNet{:multi})
+
+        @test size(embs_full.embeddings) == (300, length(embs_full.vocab))
+
+        embs_mini = load_embeddings(ConceptNet{:multi}; max_vocab_size=100)
+        @test length(embs_mini.vocab) == 100
+
+        @test embs_mini.embeddings == embs_full.embeddings[:, 1:100]
+        @test embs_mini.vocab == embs_full.vocab[1:100]
+    end
+
+    @testset_nokeep_data "English" begin
+        embs_full = load_embeddings(ConceptNet{:en})
+
+        @test size(embs_full.embeddings) == (300, length(embs_full.vocab))
+
+        embs_mini = load_embeddings(ConceptNet{:en}; max_vocab_size=100)
+        @test length(embs_mini.vocab) == 100
+
+        @test embs_mini.embeddings == embs_full.embeddings[:, 1:100]
+        @test embs_mini.vocab == embs_full.vocab[1:100]
+
+        embs_specific = load_embeddings(ConceptNet{:en};
+                                        keep_words=Set(["red", "green", "blue"]))
+
+        @test size(embs_specific.embeddings) == (300, 3)
+        @test Set(embs_specific.vocab) == Set(["red", "green", "blue"])
+    end
+
+    @testset_nokeep_data "Compressed" begin
+        embs_full = load_embeddings(ConceptNet{:compressed})
+
+        @test size(embs_full.embeddings) == (300, length(embs_full.vocab))
+
+        embs_mini = load_embeddings(ConceptNet{:compressed}; max_vocab_size=100)
+        @test length(embs_mini.vocab) == 100
+
+        @test embs_mini.embeddings == embs_full.embeddings[:, 1:100]
+        @test embs_mini.vocab == embs_full.vocab[1:100]
+
+        embs_specific = load_embeddings(ConceptNet{:compressed};
+                                        keep_words=Set(["/c/en/red", "/c/en/green", "/c/en/blue"]))
+
+        @test size(embs_specific.embeddings) == (300, 3)
+        @test Set(embs_specific.vocab) == Set(["/c/en/red", "/c/en/green", "/c/en/blue"])
+    end
+end
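Taken together, a typical session with the new system would look like the sketch below. This is not part of the patch; it assumes only the embeddings/vocab fields that the tests above exercise, and the first call downloads the data via DataDeps:

    using Embeddings

    # Load the first 10_000 English Numberbatch vectors.
    embs = load_embeddings(ConceptNet{:en}; max_vocab_size=10_000)

    # Column i of the matrix is the vector for vocab[i]; build a lookup table.
    lookup = Dict(word => i for (i, word) in enumerate(embs.vocab))
    red_vec = embs.embeddings[:, lookup["red"]]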