Adding Paragram Embedding #30

Merged 13 commits on Feb 8, 2020
2 changes: 2 additions & 0 deletions Project.toml
@@ -7,11 +7,13 @@ version = "0.4.1"
AutoHashEquals = "15f4f7f2-30c1-5605-9d31-71845cf9641f"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
GoogleDrive = "91feb7a0-3508-11ea-1e8e-afea2c1c9a19"

[compat]
AutoHashEquals = "0.2"
DataDeps = "0.5, 0.6, 0.7"
julia = "1"
GoogleDrive="0.1.0"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
1 change: 1 addition & 0 deletions REQUIRE
@@ -1,3 +1,4 @@
julia 0.7
DataDeps 0.5.1
AutoHashEquals
GoogleDrive
6 changes: 4 additions & 2 deletions src/Embeddings.jl
@@ -5,15 +5,17 @@ using DataDeps
using AutoHashEquals

export load_embeddings, language_files
-export Word2Vec, GloVe, FastText_Text
+export Word2Vec, GloVe, FastText_Text, Paragram

abstract type EmbeddingSystem{LANG} end
include("fasttext.jl")
include("glove.jl")
include("word2vec.jl")
include("Paragram.jl")
include("common.jl")

function __init__()
-    for T in [Word2Vec, GloVe, FastText]
+    for T in [Word2Vec, GloVe, FastText, Paragram]
        init(T)
    end
end
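With Paragram exported and registered in __init__, loading it should work like the other embedding systems. A minimal usage sketch, assuming the package's existing load_embeddings API and assuming the returned table exposes embeddings and vocab fields as for GloVe and Word2Vec (those field names and the keyword arguments are not shown in this diff):

using Embeddings

# First call triggers the DataDeps/GoogleDrive download registered for Paragram
# (the dependency descriptions below list the downloads at 1.7 GB each).
embtable = load_embeddings(Paragram{:en})

# Assumed result layout: a dims-by-vocab matrix plus the indexed word list.
size(embtable.embeddings)    # e.g. (300, vocab_size) for the 300-dimensional vectors
first(embtable.vocab, 5)     # first few words in the vocabulary

# Restrict the vocabulary, mirroring the max_vocab_size / keep_words arguments used below.
small = load_embeddings(Paragram{:en}; max_vocab_size = 10_000)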
44 changes: 44 additions & 0 deletions src/Paragram.jl
@@ -0,0 +1,44 @@
abstract type Paragram{LANG} <: EmbeddingSystem{LANG} end
using GoogleDrive

function init(::Type{Paragram})
    vectors = [("paragram_300_ws353",
                "Paragram-WS353 300 dimensional Paragram embeddings tuned on WordSim353 dataset. 1.7 GB download.",
                "8ed9a19f8bc400cdbca5dae7f024c0310a3e5711a46ba48036a7542614440721",
                "https://drive.google.com/uc?id=0B9w48e1rj-MOLVdZRzFfTlNsem8&export=download"),
                # "https://drive.google.com/file/d/0B9w48e1rj-MOLVdZRzFfTlNsem8/view"),
               ("paragram_300_sl999",
                "Paragram-SL999 300 dimensional Paragram embeddings tuned on SimLex999 dataset. 1.7 GB download.",
                "9a16adc7d620642f863278451db4c03a2646016440ccea7e30a37ba17868781d",
                "https://drive.google.com/uc?id=0B9w48e1rj-MOck1fRGxaZW1LU2M&export=download"),
               ]
    # https://drive.google.com/file/d/0B9w48e1rj-MOLVdZRzFfTlNsem8/view?usp=sharing
    for (depname, description, sha, link) in vectors
        register(DataDep(depname,
            """
            Pretrained Paragram word embeddings.
            Website: https://www.cs.cmu.edu/~jwieting/
            Author: John Wieting
            Year: 2015
            Licence: Open Data Commons Public Domain Dedication and License (PDDL)

            Paper:
                John Wieting, Mohit Bansal, Kevin Gimpel, Karen Livescu, Dan Roth. From Paraphrase Database to Compositional Paraphrase Model and Back.

            $description
            """,
            link,
            sha,
            fetch_method = google_download,
            post_fetch_method = unpack))

        append!(language_files(Paragram{:en}), ["$(depname)/$(depname).txt"])
    end
end

_load_embeddings(::Type{<:Paragram}, embedding_file, max_vocab_size, keep_words) = _load_embeddings_csv(embedding_file, max_vocab_size, keep_words)


28 changes: 28 additions & 0 deletions src/common.jl
@@ -0,0 +1,28 @@

function _load_embeddings_csv(embedding_file, max_vocab_size, keep_words)
Review comment (Member):

I think we can also use this for FastText; there is at least some code overlap, so we might need to factor out the common inner parts.

We should probably not call it a CSV, since it is space-separated, or we keep the CSV name and pass in a delimiter option.

Reply (Member Author):

The delimiter option has been added as suggested (see the sketch after this function).

    local LL, indexed_words, index
    if length(keep_words) > 0
        max_vocab_size = length(keep_words)
    end
    indexed_words = Vector{String}()
    LL = Vector{Vector{Float32}}()
    open(embedding_file) do f
        index = 1
        for line in eachline(f)
            xs = split(line, ' ')
            word = xs[1]
            if length(keep_words) == 0 || (word in keep_words)
                index > max_vocab_size && break
                push!(indexed_words, word)
                try
                    push!(LL, parse.(Float32, @view(xs[2:end])))
                catch err
                    err isa ArgumentError || rethrow()
                    @warn "Could not parse word vector" index word exception=err
                end
                index += 1
            end
        end
    end
    return reduce(hcat, LL), indexed_words
end
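The reply above notes that a delimiter option was added, while the hunk shown here still hard-codes split(line, ' '). The following is only an illustrative sketch of how such a keyword argument could be threaded through; the function name and the delim keyword are assumptions for illustration, not the PR's final code:

# Hypothetical variant of _load_embeddings_csv with a configurable delimiter.
# delim = ' ' keeps the current space-separated behavior as the default.
function _load_embeddings_delimited(embedding_file, max_vocab_size, keep_words; delim = ' ')
    indexed_words = Vector{String}()
    LL = Vector{Vector{Float32}}()
    if length(keep_words) > 0
        max_vocab_size = length(keep_words)
    end
    open(embedding_file) do f
        index = 1
        for line in eachline(f)
            xs = split(line, delim)   # the separator is now a parameter
            word = xs[1]
            if length(keep_words) == 0 || (word in keep_words)
                index > max_vocab_size && break
                push!(indexed_words, word)
                try
                    push!(LL, parse.(Float32, @view(xs[2:end])))
                catch err
                    err isa ArgumentError || rethrow()
                    @warn "Could not parse word vector" index word exception=err
                end
                index += 1
            end
        end
    end
    return reduce(hcat, LL), indexed_words
end

A space-separated caller (GloVe, Paragram) would use the default, while a tab-separated file could pass delim = '\t'.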
29 changes: 2 additions & 27 deletions src/glove.jl
@@ -44,30 +44,5 @@ function init(::Type{GloVe})
    end
end

-function _load_embeddings(::Type{<:GloVe}, embedding_file, max_vocab_size, keep_words)
-    local LL, indexed_words, index
-    if length(keep_words) > 0
-        max_vocab_size = length(keep_words)
-    end
-    indexed_words = Vector{String}()
-    LL = Vector{Vector{Float32}}()
-    open(embedding_file) do f
-        index = 1
-        for line in eachline(f)
-            xs = split(line, ' ')
-            word = xs[1]
-            if length(keep_words) == 0 || (word in keep_words)
-                index > max_vocab_size && break
-                push!(indexed_words, word)
-                try
-                    push!(LL, parse.(Float32, @view(xs[2:end])))
-                catch err
-                    err isa ArgumentError || rethrow()
-                    @warn "Could not parse word vector" index word exception=err
-                end
-                index += 1
-            end
-        end
-    end
-    return reduce(hcat, LL), indexed_words
-end
+_load_embeddings(::Type{<:GloVe}, embedding_file, max_vocab_size, keep_words) = _load_embeddings_csv(embedding_file, max_vocab_size, keep_words)