From 5a40305cbd66b113d71f0dc6c132205c2f6e167a Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Mon, 17 Sep 2018 17:17:18 +0200 Subject: [PATCH 1/5] Added ConceptNetNumberbatch initial support --- src/Embeddings.jl | 6 ++- src/conceptnet.jl | 119 ++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 50 +++++++++++++++++++ 3 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 src/conceptnet.jl diff --git a/src/Embeddings.jl b/src/Embeddings.jl index eca5846..dc8cf5f 100644 --- a/src/Embeddings.jl +++ b/src/Embeddings.jl @@ -3,17 +3,19 @@ module Embeddings using Statistics: norm using DataDeps using AutoHashEquals +using HDF5 export load_embeddings, language_files -export Word2Vec, GloVe, FastText_Text +export Word2Vec, GloVe, FastText_Text, ConceptNet abstract type EmbeddingSystem{LANG} end include("fasttext.jl") include("glove.jl") include("word2vec.jl") +include("conceptnet.jl") function __init__() - for T in [Word2Vec, GloVe, FastText] + for T in [Word2Vec, GloVe, FastText, ConceptNet] init(T) end end diff --git a/src/conceptnet.jl b/src/conceptnet.jl new file mode 100644 index 0000000..39728c5 --- /dev/null +++ b/src/conceptnet.jl @@ -0,0 +1,119 @@ +abstract type ConceptNet{LANG} <: EmbeddingSystem{LANG} end + + +function init(::Type{ConceptNet}) + for (source, link, hashstring, post_fetch_method, lang) in [ + ("Multilingual", + "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz", + "375ac41e8f9caab172fc169e54e8658be23ece6cd92188bf700a99330cd1a81b", + DataDeps.unpack, + :multi), + ("English", + "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz", + "72faf0a487c61b9a6a8c9ff0a1440d2f4936bb19102bddf27a833c2567620f2d", + DataDeps.unpack, + :en), + ("Compressed", + "http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5", + "87f8b48fd01088b72013dfbcc3160f0d0e16942797959e0824b9b69f31a7b222", + identity, + :compressed) + ] + 
register(DataDep("ConceptNet_$source",
+                         """
+                         Pretrained ConceptNetNumberbatch Word embeddings
+                         Website: https://github.com/commonsense/conceptnet-numberbatch
+                         License: CC-SA 4.0
+                         Author: Luminoso Technologies Inc.
+                         Year: 2017
+
+                         ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known as word embeddings) that can be used directly as a representation of word meanings or as a starting point for further machine learning.
+
+                         Paper:
+                             Robert Speer, Joshua Chin, and Catherine Havasi (2017). "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge." In proceedings of AAAI 2017.
+                         """,
+                        link,
+                        hashstring;
+                        post_fetch_method=post_fetch_method
+                       ))
+
+        push!(language_files(ConceptNet{lang}), "ConceptNet_$(source)/$(join(split(link, "/")[end]))")
+    end
+end
+
+
+function _load_embeddings(::Type{<:ConceptNet}, embedding_file, max_vocab_size, keep_words)
+    local LL, indexed_words, index
+    if any(endswith.(file, [".h5", ".hdf5"]))
+        LL, indexed_words = _load_hdf5_embeddings(embedding_file,
+                                                  max_vocab_size=max_vocab_size,
+                                                  keep_words=keep_words)
+    else
+        LL, indexed_words = _load_txt_embeddings(embedding_file,
+                                                 max_vocab_size=max_vocab_size,
+                                                 keep_words=keep_words)
+    end
+    return LL, indexed_words
+end
+
+
+# Function that calculates how many embeddings to retrieve
+function _get_vocab_size(real_vocab_size, max_vocab_size)
+    real_vocab_size = max(0, real_vocab_size)
+    max_vocab_size = min(real_vocab_size, max_vocab_size)
+    return max_vocab_size
+end
+
+
+function _load_txt_embeddings(file::AbstractString, max_vocab_size, keep_words)
+    open(file, "r") do fid
+        vocab_size, vector_size = map(x->parse(Int,x), split(readline(fid)))
+        max_stored_vocab_size = _get_vocab_size(vocab_size, max_vocab_size)
+        data = readlines(fid)
+
+        indexed_words = Vector{String}(undef, max_stored_vocab_size)
+        LL = Array{Float32}(undef, vector_size, max_stored_vocab_size)
+        _parseline = (buf)-> begin
+            bufvec = split(buf, " ")
+            word = 
string(popfirst!(bufvec)) + embedding = parse.(Float64, bufvec) + #embedding = map(x->parse(Float64,x), bufvec) + return word, embedding + end + + cnt = 0 + for (index, row) in enumerate(data) + word, embedding = _parseline(row) + if length(keep_words)==0 || word in keep_words + LL[:, index] = embedding + idexed_words[index] = word + cnt+=1 + if cnt > max_stored_vocab_size + break + end + end + end + end + return LL, indexed_words +end + + +# Loads the ConceptNetNumberbatch from a HDF5 file +function _load_hdf5_embeddings(file::AbstractString, max_vocab_size, keep_words) + payload = h5open(read, file)["mat"] + words = payload["axis1"] + vectors = payload["block0_values"] + max_vocab_size = _get_vocab_size(length(words), max_vocab_size) + + indices = Int[] + for (index, word) in enumerate(words) + if length(keep_words)==0 || word in keep_words + push!(indices, index) + cnt+=1 + if cnt > max_stored_vocab_size + break + end + end + end + return vectors[:, indices], words[indices] +end diff --git a/test/runtests.jl b/test/runtests.jl index 8ad221e..917a8c1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -147,3 +147,53 @@ end end end + +@testset "ConceptNet" begin + @testset_nokeep_data "Multilingual" begin + embs_full = load_embeddings(ConceptNet{:multi}) + + @test size(embs_full.embeddings) == (300, length(embs_full.vocab)) + + embs_mini = load_embeddings(ConceptNet{:multi}; max_vocab_size=100) + @test length(embs_mini.vocab)==100 + + @test embs_mini.embeddings == embs_full.embeddings[:, 1:100] + @test embs_mini.vocab == embs_full.vocab[1:100] + end + + @testset_nokeep_data "English" begin + embs_full = load_embeddings(ConceptNet{:en}) + + @test size(embs_full.embeddings) == (300, length(embs_full.vocab)) + + embs_mini = load_embeddings(ConceptNet{:en}; max_vocab_size=100) + @test length(embs_mini.vocab)==100 + + @test embs_mini.embeddings == embs_full.embeddings[:, 1:100] + @test embs_mini.vocab == embs_full.vocab[1:100] + + embs_specific = 
load_embeddings(ConceptNet{:en}; + keep_words=Set(["red", "green", "blue"])) + + @test size(embs_specific.embeddings) == (300, 3) + @test Set(embs_specific.vocab) == Set(["red", "green", "blue"]) + end + + @testset_nokeep_data "Compressed" begin + embs_full = load_embeddings(ConceptNet{:compressed}) + + @test size(embs_full.embeddings) == (300, length(embs_full.vocab)) + + embs_mini = load_embeddings(ConceptNet{:compressed}; max_vocab_size=100) + @test length(embs_mini.vocab)==100 + + @test embs_mini.embeddings == embs_full.embeddings[:, 1:100] + @test embs_mini.vocab == embs_full.vocab[1:100] + + embs_specific = load_embeddings(ConceptNet{:compressed}; + keep_words=Set(["red", "green", "blue"])) + + @test size(embs_specific.embeddings) == (300, 3) + @test Set(embs_specific.vocab) == Set(["red", "green", "blue"]) + end +end From 1c56957181dc2e3726dfbbf6ba26486d3868006c Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Mon, 17 Sep 2018 17:19:13 +0200 Subject: [PATCH 2/5] Added HDF5 to REQUIRE --- REQUIRE | 1 + 1 file changed, 1 insertion(+) diff --git a/REQUIRE b/REQUIRE index 34cb1fc..4c638da 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,3 +1,4 @@ julia 0.7 DataDeps 0.5.1 AutoHashEquals +HDF5 From fd870280d8b251e6ee03ce25380f6f4e7638a1f8 Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Mon, 17 Sep 2018 17:26:59 +0200 Subject: [PATCH 3/5] Bugfixes --- src/conceptnet.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/conceptnet.jl b/src/conceptnet.jl index 39728c5..6a9c175 100644 --- a/src/conceptnet.jl +++ b/src/conceptnet.jl @@ -2,19 +2,22 @@ abstract type ConceptNet{LANG} <: EmbeddingSystem{LANG} end function init(::Type{ConceptNet}) - for (source, link, hashstring, post_fetch_method, lang) in [ + for (source, link, file, hashstring, post_fetch_method, lang) in [ ("Multilingual", "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz", + "numberbatch-17.06.txt", 
"375ac41e8f9caab172fc169e54e8658be23ece6cd92188bf700a99330cd1a81b", DataDeps.unpack, :multi), ("English", "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz", + "numberbatch-en-17.06.txt", "72faf0a487c61b9a6a8c9ff0a1440d2f4936bb19102bddf27a833c2567620f2d", DataDeps.unpack, :en), ("Compressed", "http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5", + "mini.h5", "87f8b48fd01088b72013dfbcc3160f0d0e16942797959e0824b9b69f31a7b222", identity, :compressed) @@ -37,14 +40,14 @@ function init(::Type{ConceptNet}) post_fetch_method=post_fetch_method )) - push!(language_files(ConceptNet{lang}), "ConceptNet_$(source)/$(join(split(link, "/")[end]))") + push!(language_files(ConceptNet{lang}), "ConceptNet_$(source)/$(file)") end end function _load_embeddings(::Type{<:ConceptNet}, embedding_file, max_vocab_size, keep_words) local LL, indexed_words, index - if any(endswith.(file, [".h5", ".hdf5"])) + if any(endswith.(embedding_file, [".h5", ".hdf5"])) LL, indexed_words = _load_hdf5_embeddings(embedding_file, max_vocab_size=max_vocab_size, keep_words=keep_words) From 66639c0d81e33c8511acd79f49cb2074c484f1c6 Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Mon, 17 Sep 2018 17:54:53 +0200 Subject: [PATCH 4/5] Another round of bugfixes --- src/conceptnet.jl | 32 +++++++++++++++++--------------- test/runtests.jl | 4 ++-- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/conceptnet.jl b/src/conceptnet.jl index 6a9c175..0ee084b 100644 --- a/src/conceptnet.jl +++ b/src/conceptnet.jl @@ -10,7 +10,7 @@ function init(::Type{ConceptNet}) DataDeps.unpack, :multi), ("English", - "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz", + "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz", "numberbatch-en-17.06.txt", "72faf0a487c61b9a6a8c9ff0a1440d2f4936bb19102bddf27a833c2567620f2d", DataDeps.unpack, @@ -47,15 +47,10 @@ end function 
_load_embeddings(::Type{<:ConceptNet}, embedding_file, max_vocab_size, keep_words) local LL, indexed_words, index - if any(endswith.(embedding_file, [".h5", ".hdf5"])) - LL, indexed_words = _load_hdf5_embeddings(embedding_file, - max_vocab_size=max_vocab_size, - keep_words=keep_words) - else - LL, indexed_words = _load_txt_embeddings(embedding_file, - max_vocab_size=max_vocab_size, - keep_words=keep_words) - end + _loader = ifelse(any(endswith.(embedding_file, [".h5", ".hdf5"])), + _load_hdf5_embeddings, + _load_txt_embeddings) + LL, indexed_words = _loader(embedding_file, max_vocab_size, keep_words) return LL, indexed_words end @@ -69,6 +64,7 @@ end function _load_txt_embeddings(file::AbstractString, max_vocab_size, keep_words) + local LL, indexed_words open(file, "r") do fid vocab_size, vector_size = map(x->parse(Int,x), split(readline(fid))) max_stored_vocab_size = _get_vocab_size(vocab_size, max_vocab_size) @@ -84,18 +80,23 @@ function _load_txt_embeddings(file::AbstractString, max_vocab_size, keep_words) return word, embedding end + # TODO Improve performance of this bit cnt = 0 + indices = Int[] for (index, row) in enumerate(data) word, embedding = _parseline(row) + LL[:, index] = embedding + indexed_words[index] = word if length(keep_words)==0 || word in keep_words - LL[:, index] = embedding - idexed_words[index] = word + push!(indices, index) cnt+=1 - if cnt > max_stored_vocab_size + if cnt > max_stored_vocab_size-1 break end end end + LL = LL[:, indices] + indexed_words = indexed_words[indices] end return LL, indexed_words end @@ -106,14 +107,15 @@ function _load_hdf5_embeddings(file::AbstractString, max_vocab_size, keep_words) payload = h5open(read, file)["mat"] words = payload["axis1"] vectors = payload["block0_values"] - max_vocab_size = _get_vocab_size(length(words), max_vocab_size) + max_stored_vocab_size = _get_vocab_size(length(words), max_vocab_size) indices = Int[] + cnt = 0 for (index, word) in enumerate(words) if length(keep_words)==0 || word in 
keep_words push!(indices, index) cnt+=1 - if cnt > max_stored_vocab_size + if cnt > max_stored_vocab_size-1 break end end diff --git a/test/runtests.jl b/test/runtests.jl index 917a8c1..b7fa198 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -191,9 +191,9 @@ end @test embs_mini.vocab == embs_full.vocab[1:100] embs_specific = load_embeddings(ConceptNet{:compressed}; - keep_words=Set(["red", "green", "blue"])) + keep_words=Set(["/c/en/red", "/c/en/green", "/c/en/blue"])) @test size(embs_specific.embeddings) == (300, 3) - @test Set(embs_specific.vocab) == Set(["red", "green", "blue"]) + @test Set(embs_specific.vocab) == Set(["/c/en/red", "/c/en/green", "/c/en/blue"]) end end From 9d864413793af89d0299442a43d1545e938bc79b Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Wed, 19 Sep 2018 10:12:19 +0200 Subject: [PATCH 5/5] Implemented review remarks, #14 --- src/conceptnet.jl | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/conceptnet.jl b/src/conceptnet.jl index 0ee084b..e45a33f 100644 --- a/src/conceptnet.jl +++ b/src/conceptnet.jl @@ -68,26 +68,29 @@ function _load_txt_embeddings(file::AbstractString, max_vocab_size, keep_words) open(file, "r") do fid vocab_size, vector_size = map(x->parse(Int,x), split(readline(fid))) max_stored_vocab_size = _get_vocab_size(vocab_size, max_vocab_size) - data = readlines(fid) indexed_words = Vector{String}(undef, max_stored_vocab_size) - LL = Array{Float32}(undef, vector_size, max_stored_vocab_size) - _parseline = (buf)-> begin + LL = Array{Float64}(undef, vector_size, max_stored_vocab_size) + + function _parseline(buf; word_only=false) bufvec = split(buf, " ") word = string(popfirst!(bufvec)) - embedding = parse.(Float64, bufvec) - #embedding = map(x->parse(Float64,x), bufvec) - return word, embedding + if word_only + return word, Float64[] + else + embedding = parse.(Float64, bufvec) + return word, embedding + end end - # TODO Improve performance of this bit cnt = 0 
indices = Int[] - for (index, row) in enumerate(data) - word, embedding = _parseline(row) - LL[:, index] = embedding - indexed_words[index] = word + for (index, row) in enumerate(eachline(fid)) + word, _ = _parseline(row, word_only=true) if length(keep_words)==0 || word in keep_words + _, embedding = _parseline(row) + LL[:, index] = embedding + indexed_words[index] = word push!(indices, index) cnt+=1 if cnt > max_stored_vocab_size-1