From 3516853d80a03ef9876eb945e3755c559ed58faa Mon Sep 17 00:00:00 2001 From: jinyus Date: Wed, 8 Nov 2023 21:04:38 -0500 Subject: [PATCH 1/4] init impl --- gleam/.github/workflows/test.yml | 24 +++++++ gleam/.gitignore | 4 ++ gleam/README.md | 22 ++++++ gleam/gleam.toml | 20 ++++++ gleam/manifest.toml | 19 ++++++ gleam/src/json.gleam | 66 ++++++++++++++++++ gleam/src/related.gleam | 113 +++++++++++++++++++++++++++++++ gleam/src/types.gleam | 7 ++ gleam/test/related_test.gleam | 12 ++++ 9 files changed, 287 insertions(+) create mode 100644 gleam/.github/workflows/test.yml create mode 100644 gleam/.gitignore create mode 100644 gleam/README.md create mode 100644 gleam/gleam.toml create mode 100644 gleam/manifest.toml create mode 100644 gleam/src/json.gleam create mode 100644 gleam/src/related.gleam create mode 100644 gleam/src/types.gleam create mode 100644 gleam/test/related_test.gleam diff --git a/gleam/.github/workflows/test.yml b/gleam/.github/workflows/test.yml new file mode 100644 index 00000000..7cccb7f8 --- /dev/null +++ b/gleam/.github/workflows/test.yml @@ -0,0 +1,24 @@ +name: test + +on: + # push: + # branches: + # - master + # - main + # pull_request: + workflow_dispatch: + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: erlef/setup-beam@v1 + with: + otp-version: "26.0.2" + gleam-version: "0.32.2" + rebar3-version: "3" + # elixir-version: "1.15.4" + - run: gleam format --check src test + - run: gleam deps download + - run: gleam test diff --git a/gleam/.gitignore b/gleam/.gitignore new file mode 100644 index 00000000..170cca98 --- /dev/null +++ b/gleam/.gitignore @@ -0,0 +1,4 @@ +*.beam +*.ez +build +erl_crash.dump diff --git a/gleam/README.md b/gleam/README.md new file mode 100644 index 00000000..a864a5a6 --- /dev/null +++ b/gleam/README.md @@ -0,0 +1,22 @@ +# related + +[![Package Version](https://img.shields.io/hexpm/v/related)](https://hex.pm/packages/related) +[![Hex 
Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/related/) + +## Quick start + +```sh +gleam run # Run the project +gleam test # Run the tests +gleam shell # Run an Erlang shell +``` + +## Installation + +If available on Hex this package can be added to your Gleam project: + +```sh +gleam add related +``` + +and its documentation can be found at <https://hexdocs.pm/related>. diff --git a/gleam/gleam.toml b/gleam/gleam.toml new file mode 100644 index 00000000..3da9df33 --- /dev/null +++ b/gleam/gleam.toml @@ -0,0 +1,20 @@ +name = "related" +version = "0.1.0" +gleam = ">= 0.32.0" + +# Fill out these fields if you intend to generate HTML documentation or publish +# your project to the Hex package manager. +# +# description = "" +# licences = ["Apache-2.0"] +# repository = { type = "github", user = "username", repo = "project" } +# links = [{ title = "Website", href = "https://gleam.run" }] + +[dependencies] +gleam_stdlib = "~> 0.32" +gleam_json = "~> 0.7" +simplifile = "~> 0.2" +birl = "~> 0.17" + +[dev-dependencies] +gleeunit = "~> 1.0" diff --git a/gleam/manifest.toml b/gleam/manifest.toml new file mode 100644 index 00000000..f83d7de9 --- /dev/null +++ b/gleam/manifest.toml @@ -0,0 +1,19 @@ +# This file was generated by Gleam +# You typically do not need to edit this file + +packages = [ + { name = "birl", version = "0.17.0", build_tools = ["gleam"], requirements = ["ranger", "gleam_stdlib"], otp_app = "birl", source = "hex", outer_checksum = "AEA55D2329E13E72CB7FCC0FBAE640C453E6C9BF45B8D9DB4D7457F7F5C18F49" }, + { name = "gleam_json", version = "0.7.0", build_tools = ["gleam"], requirements = ["thoas", "gleam_stdlib"], otp_app = "gleam_json", source = "hex", outer_checksum = "CB405BD93A8828BCD870463DE29375E7B2D252D9D124C109E5B618AAC00B86FC" }, + { name = "gleam_stdlib", version = "0.32.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "07D64C26D014CF570F8ACADCE602761EA2E74C842D26F2FD49B0D61973D9966F" }, + { name = 
"gleeunit", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "D3682ED8C5F9CAE1C928F2506DE91625588CC752495988CBE0F5653A42A6F334" }, + { name = "ranger", version = "0.5.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "ranger", source = "hex", outer_checksum = "653A15D0C73E75AD4EC6DE40478F0EBBA1157D0F5BB16FDD454D0C1129C32D41" }, + { name = "simplifile", version = "0.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "856DD0CD5FEEB464FB32522F6C9C51F5DE1398799C17028D3645EDC4B732E7DB" }, + { name = "thoas", version = "0.4.1", build_tools = ["rebar3"], requirements = [], otp_app = "thoas", source = "hex", outer_checksum = "4918D50026C073C4AB1388437132C77A6F6F7C8AC43C60C13758CC0ADCE2134E" }, +] + +[requirements] +birl = { version = "~> 0.17" } +gleam_json = { version = "~> 0.7" } +gleam_stdlib = { version = "~> 0.32" } +gleeunit = { version = "~> 1.0" } +simplifile = { version = "~> 0.2" } diff --git a/gleam/src/json.gleam b/gleam/src/json.gleam new file mode 100644 index 00000000..08203a95 --- /dev/null +++ b/gleam/src/json.gleam @@ -0,0 +1,66 @@ +import gleam/json +import gleam/dynamic +import types.{type Post, type RelatedPost} + +pub fn post_to_object(post: Post) -> json.Json { + json.object([ + #("_id", json.string(post.id)), + #("title", json.string(post.title)), + #("tags", json.array(post.tags, of: json.string)), + ]) +} + +pub fn post_to_json(post: Post) -> String { + post_to_object(post) + |> json.to_string +} + +pub fn related_post_to_json(related_post: RelatedPost) -> String { + json.object([ + #("_id", json.string(related_post.id)), + #("tags", json.array(related_post.tags, of: json.string)), + #("related", json.array(related_post.related, of: post_to_object)), + ]) + |> json.to_string +} + +pub fn related_post_to_object(related_post: RelatedPost) -> json.Json { + json.object([ + 
#("_id", json.string(related_post.id)), + #("tags", json.array(related_post.tags, of: json.string)), + #("related", json.array(related_post.related, of: post_to_object)), + ]) +} + +pub fn related_posts_to_json(related_posts: List(RelatedPost)) -> String { + json.array(related_posts, of: related_post_to_object) + |> json.to_string +} + +pub fn posts_from_json( + json_string: String, +) -> Result(List(Post), json.DecodeError) { + let post_decoder = + dynamic.decode3( + types.Post, + dynamic.field("_id", of: dynamic.string), + dynamic.field("title", of: dynamic.string), + dynamic.field("tags", of: dynamic.list(dynamic.string)), + ) + + json.decode(from: json_string, using: dynamic.list(post_decoder)) +} +// import myapp.{Cat} +// import gleam/json +// import gleam/dynamic.{field, list, int, string} + +// pub fn cat_from_json(json_string: String) -> Result(Cat, json.DecodeError) { +// let cat_decoder = dynamic.decode3( +// Cat, +// field("name", of: string), +// field("lives", of: int), +// field("nicknames", of: list(string)), +// ) + +// json.decode(from: json_string, using: cat_decoder) +// } diff --git a/gleam/src/related.gleam b/gleam/src/related.gleam new file mode 100644 index 00000000..0ea2e6b3 --- /dev/null +++ b/gleam/src/related.gleam @@ -0,0 +1,113 @@ +import gleam/io +import gleam/int +import gleam/list +import gleam/map +import gleam/option.{None, Some} +import gleam/result +import types +import simplifile.{read, write} +import json +import birl/time + +// very slow; needs work; takes 7s for 1 post +// no mutability, too much copying in hot loop +// sort on linked list too slow + +const top_n = 5 + +pub fn main() { + let assert Ok(posts_raw) = read(from: "../posts.json") + let assert Ok(posts) = json.posts_from_json(posts_raw) + + let start = time.monotonic_now() + + let tag_map: map.Map(String, List(Int)) = + posts + |> list.index_fold( + map.new(), + fn(acc, post, i) { + list.fold( + post.tags, + acc, + fn(acc, tag) { + map.update( + acc, + tag, + 
fn(val) { + case val { + None -> [i] + Some(post_ids) -> list.append(post_ids, [i]) + } + }, + ) + }, + ) + }, + ) + + let all_related = + posts + |> list.take(1) + |> list.index_map(fn(i, post) { + // no arrays, must use maps + let tagged_post_count_temp = + posts + |> list.index_map(fn(i, _) { #(i, 0) }) + |> map.from_list + + let tagged_post_count = + posts + |> list.fold( + tagged_post_count_temp, + fn(acc, _) { + post.tags + |> list.fold( + acc, + fn(acc, tag) { + tag_map + |> map.get(tag) + |> result.unwrap(or: []) + |> list.fold( + acc, + fn(acc, o_idx) { + let assert Ok(current_count) = + acc + |> map.get(o_idx) + + acc + |> map.insert(o_idx, current_count + 1) + }, + ) + }, + ) + }, + ) + + let related_posts = + tagged_post_count + |> map.to_list + |> list.filter(fn(a) { a.1 > 0 && a.0 != i }) + |> list.sort(fn(a, b) { int.compare(b.1, a.1) }) + |> list.take(top_n) + |> list.map(fn(a) { + let assert Ok(post) = + posts + |> list.at(a.0) + post + }) + + types.RelatedPost(id: post.id, tags: post.tags, related: related_posts) + }) + + let end = time.monotonic_now() + + let took = + { end - start } / 1000 + |> int.to_string + + io.println("Processing time (w/o IO): " <> { took } <> "ms") + + let all_related_json = json.related_posts_to_json(all_related) + + let assert Ok(_) = write(all_related_json, "../related_posts_gleam.json") +} diff --git a/gleam/src/types.gleam b/gleam/src/types.gleam new file mode 100644 index 00000000..bb246c21 --- /dev/null +++ b/gleam/src/types.gleam @@ -0,0 +1,7 @@ +pub type Post { + Post(id: String, title: String, tags: List(String)) +} + +pub type RelatedPost { + RelatedPost(id: String, tags: List(String), related: List(Post)) +} diff --git a/gleam/test/related_test.gleam b/gleam/test/related_test.gleam new file mode 100644 index 00000000..3831e7a9 --- /dev/null +++ b/gleam/test/related_test.gleam @@ -0,0 +1,12 @@ +import gleeunit +import gleeunit/should + +pub fn main() { + gleeunit.main() +} + +// gleeunit test functions end in 
`_test` +pub fn hello_world_test() { + 1 + |> should.equal(1) +} From 3be4991d3c30ec78242ba0cb4e02dfce48aff863 Mon Sep 17 00:00:00 2001 From: jinyus Date: Wed, 8 Nov 2023 21:09:15 -0500 Subject: [PATCH 2/4] tidy --- gleam/src/json.gleam | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/gleam/src/json.gleam b/gleam/src/json.gleam index 08203a95..33eda7d2 100644 --- a/gleam/src/json.gleam +++ b/gleam/src/json.gleam @@ -50,17 +50,3 @@ pub fn posts_from_json( json.decode(from: json_string, using: dynamic.list(post_decoder)) } -// import myapp.{Cat} -// import gleam/json -// import gleam/dynamic.{field, list, int, string} - -// pub fn cat_from_json(json_string: String) -> Result(Cat, json.DecodeError) { -// let cat_decoder = dynamic.decode3( -// Cat, -// field("name", of: string), -// field("lives", of: int), -// field("nicknames", of: list(string)), -// ) - -// json.decode(from: json_string, using: cat_decoder) -// } From c33b23d5ba3b543e357c308daf5212dee7de76a0 Mon Sep 17 00:00:00 2001 From: jinyus Date: Thu, 9 Nov 2023 08:03:09 -0500 Subject: [PATCH 3/4] minor refactor --- gleam/src/related.gleam | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/gleam/src/related.gleam b/gleam/src/related.gleam index 0ea2e6b3..f4415852 100644 --- a/gleam/src/related.gleam +++ b/gleam/src/related.gleam @@ -70,12 +70,8 @@ pub fn main() { |> list.fold( acc, fn(acc, o_idx) { - let assert Ok(current_count) = - acc - |> map.get(o_idx) - - acc - |> map.insert(o_idx, current_count + 1) + let assert Ok(current_count) = map.get(acc, o_idx) + map.insert(acc, o_idx, current_count + 1) }, ) }, @@ -90,9 +86,7 @@ pub fn main() { |> list.sort(fn(a, b) { int.compare(b.1, a.1) }) |> list.take(top_n) |> list.map(fn(a) { - let assert Ok(post) = - posts - |> list.at(a.0) + let assert Ok(post) = list.at(posts, a.0) post }) From 714a04bd325cb9d086ba94763d02a6434db72fe2 Mon Sep 17 00:00:00 2001 From: jinyus Date: Tue, 14 Nov 2023 09:12:14 -0500 Subject: [PATCH 
4/4] use more suitable algo: 7s -> 8ms Co-authored-by: giacomocavalieri --- gleam/src/json.gleam | 52 ---------------- gleam/src/post.gleam | 69 +++++++++++++++++++++ gleam/src/related.gleam | 129 ++++++++++++++-------------------------- gleam/src/types.gleam | 7 --- 4 files changed, 112 insertions(+), 145 deletions(-) delete mode 100644 gleam/src/json.gleam create mode 100644 gleam/src/post.gleam delete mode 100644 gleam/src/types.gleam diff --git a/gleam/src/json.gleam b/gleam/src/json.gleam deleted file mode 100644 index 33eda7d2..00000000 --- a/gleam/src/json.gleam +++ /dev/null @@ -1,52 +0,0 @@ -import gleam/json -import gleam/dynamic -import types.{type Post, type RelatedPost} - -pub fn post_to_object(post: Post) -> json.Json { - json.object([ - #("_id", json.string(post.id)), - #("title", json.string(post.title)), - #("tags", json.array(post.tags, of: json.string)), - ]) -} - -pub fn post_to_json(post: Post) -> String { - post_to_object(post) - |> json.to_string -} - -pub fn related_post_to_json(related_post: RelatedPost) -> String { - json.object([ - #("_id", json.string(related_post.id)), - #("tags", json.array(related_post.tags, of: json.string)), - #("related", json.array(related_post.related, of: post_to_object)), - ]) - |> json.to_string -} - -pub fn related_post_to_object(related_post: RelatedPost) -> json.Json { - json.object([ - #("_id", json.string(related_post.id)), - #("tags", json.array(related_post.tags, of: json.string)), - #("related", json.array(related_post.related, of: post_to_object)), - ]) -} - -pub fn related_posts_to_json(related_posts: List(RelatedPost)) -> String { - json.array(related_posts, of: related_post_to_object) - |> json.to_string -} - -pub fn posts_from_json( - json_string: String, -) -> Result(List(Post), json.DecodeError) { - let post_decoder = - dynamic.decode3( - types.Post, - dynamic.field("_id", of: dynamic.string), - dynamic.field("title", of: dynamic.string), - dynamic.field("tags", of: 
dynamic.list(dynamic.string)), - ) - - json.decode(from: json_string, using: dynamic.list(post_decoder)) -} diff --git a/gleam/src/post.gleam b/gleam/src/post.gleam new file mode 100644 index 00000000..0ff63137 --- /dev/null +++ b/gleam/src/post.gleam @@ -0,0 +1,69 @@ +import gleam/dynamic.{type DecodeError, type Decoder, type Dynamic} +import gleam/json.{type Json} +import gleam/set.{type Set} +import gleam/result + +pub type Post { + Post(id: PostId, title: String, tags: Set(Tag)) +} + +pub opaque type Tag { + Tag(string: String) +} + +pub opaque type PostId { + PostId(string: String) +} + +pub type RelatedPost { + RelatedPost(id: PostId, tags: List(Tag), related: List(Post)) +} + +// JSON ENCODERS --------------------------------------------------------------- + +pub fn related_to_json(related_post: RelatedPost) -> Json { + json.object([ + #("_id", id_to_json(related_post.id)), + #("tags", json.array(related_post.tags, of: tag_to_json)), + #("related", json.array(related_post.related, of: post_to_json)), + ]) +} + +fn post_to_json(post: Post) -> Json { + let Post(id, title, tags) = post + json.object([ + #("_id", id_to_json(id)), + #("title", json.string(title)), + #("tags", json.array(set.to_list(tags), of: tag_to_json)), + ]) +} + +fn tag_to_json(tag: Tag) -> Json { + let Tag(tag) = tag + json.string(tag) +} + +fn id_to_json(id: PostId) -> Json { + let PostId(id) = id + json.string(id) +} + +// DECODERS -------------------------------------------------------------------- + +pub fn decode(data: Dynamic) -> Result(Post, List(DecodeError)) { + let post_id_decoder = fn(data) { result.map(dynamic.string(data), PostId) } + let tag_decoder = fn(data) { result.map(dynamic.string(data), Tag) } + dynamic.decode3( + Post, + dynamic.field("_id", of: post_id_decoder), + dynamic.field("title", of: dynamic.string), + dynamic.field("tags", of: decode_set_from_list(tag_decoder)), + )(data) +} + +fn decode_set_from_list(with decoder: Decoder(a)) -> Decoder(Set(a)) { + fn(data) 
{ + dynamic.list(of: decoder)(data) + |> result.map(set.from_list) + } +} diff --git a/gleam/src/related.gleam b/gleam/src/related.gleam index f4415852..93cd662a 100644 --- a/gleam/src/related.gleam +++ b/gleam/src/related.gleam @@ -1,107 +1,64 @@ +import gleam/dynamic import gleam/io import gleam/int import gleam/list -import gleam/map -import gleam/option.{None, Some} -import gleam/result -import types +import gleam/bool +import gleam/set +import gleam/pair +import gleam/float import simplifile.{read, write} -import json import birl/time +import gleam/json +import post.{type Post, type RelatedPost, RelatedPost} -// very slow; needs work; takes 7s for 1 post -// no mutability, too much copying in hot loop -// sort on linked list too slow +fn to_related_post(post: Post, posts: List(Post)) -> RelatedPost { + RelatedPost( + post.id, + set.to_list(post.tags), + similar_posts(to: post, among: posts), + ) +} + +fn similar_posts(to post: Post, among posts: List(Post)) -> List(Post) { + let other_posts_with_similarity = { + use other_post <- list.filter_map(posts) + use <- bool.guard(when: other_post.id == post.id, return: Error(Nil)) + let similarity = count_shared_tags(post, other_post) + Ok(#(similarity, other_post)) + } + take_top_n(other_posts_with_similarity) +} + +fn count_shared_tags(one: Post, other: Post) -> Int { + set.size(set.intersection(one.tags, other.tags)) +} const top_n = 5 +fn take_top_n(list: List(#(Int, a))) -> List(a) { + list.sort(list, fn(one, other) { int.compare(other.0, one.0) }) + |> list.take(top_n) + |> list.map(pair.second) +} + pub fn main() { - let assert Ok(posts_raw) = read(from: "../posts.json") - let assert Ok(posts) = json.posts_from_json(posts_raw) + let assert Ok(raw_posts) = read("../posts.json") + let assert Ok(posts) = + json.decode(raw_posts, using: dynamic.list(post.decode)) let start = time.monotonic_now() - - let tag_map: map.Map(String, List(Int)) = - posts - |> list.index_fold( - map.new(), - fn(acc, post, i) { - 
list.fold( - post.tags, - acc, - fn(acc, tag) { - map.update( - acc, - tag, - fn(val) { - case val { - None -> [i] - Some(post_ids) -> list.append(post_ids, [i]) - } - }, - ) - }, - ) - }, - ) - - let all_related = + let related_posts = posts |> list.take(1) - |> list.index_map(fn(i, post) { - // no arrays, must use maps - let tagged_post_count_temp = - posts - |> list.index_map(fn(i, _) { #(i, 0) }) - |> map.from_list - - let tagged_post_count = - posts - |> list.fold( - tagged_post_count_temp, - fn(acc, _) { - post.tags - |> list.fold( - acc, - fn(acc, tag) { - tag_map - |> map.get(tag) - |> result.unwrap(or: []) - |> list.fold( - acc, - fn(acc, o_idx) { - let assert Ok(current_count) = map.get(acc, o_idx) - map.insert(acc, o_idx, current_count + 1) - }, - ) - }, - ) - }, - ) - - let related_posts = - tagged_post_count - |> map.to_list - |> list.filter(fn(a) { a.1 > 0 && a.0 != i }) - |> list.sort(fn(a, b) { int.compare(b.1, a.1) }) - |> list.take(top_n) - |> list.map(fn(a) { - let assert Ok(post) = list.at(posts, a.0) - post - }) - - types.RelatedPost(id: post.id, tags: post.tags, related: related_posts) - }) - + |> list.map(to_related_post(_, posts)) let end = time.monotonic_now() - let took = - { end - start } / 1000 - |> int.to_string - + let took = float.to_string(int.to_float(end - start) /. 1000.0) io.println("Processing time (w/o IO): " <> { took } <> "ms") - let all_related_json = json.related_posts_to_json(all_related) + let all_related_json = + json.array(related_posts, of: post.related_to_json) + |> json.to_string let assert Ok(_) = write(all_related_json, "../related_posts_gleam.json") } diff --git a/gleam/src/types.gleam b/gleam/src/types.gleam deleted file mode 100644 index bb246c21..00000000 --- a/gleam/src/types.gleam +++ /dev/null @@ -1,7 +0,0 @@ -pub type Post { - Post(id: String, title: String, tags: List(String)) -} - -pub type RelatedPost { - RelatedPost(id: String, tags: List(String), related: List(Post)) -}