From 1117e29262d6d1ae04851034b8a6e55e7d340f62 Mon Sep 17 00:00:00 2001 From: weisu Date: Mon, 30 Jan 2023 17:11:51 +0800 Subject: [PATCH 1/4] [bug fix]: fix bug of multiple losses of rank model --- .git_bin_url | 5 +---- easy_rec/version.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.git_bin_url b/.git_bin_url index dde02f4c0..da4f7404d 100644 --- a/.git_bin_url +++ b/.git_bin_url @@ -37,8 +37,5 @@ {"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"} {"leaf_path": "data/test/mt_ckpt", "sig": "803499f48e2df5e51ce5606e9649c6d4", "remote_path": "data/git_oss_sample_data/data_test_mt_ckpt_803499f48e2df5e51ce5606e9649c6d4"} {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": "data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"} -{"leaf_path": "data/test/tb_data", "sig": "f1279ca42de1734be321e88f85775d5f", "remote_path": "data/git_oss_sample_data/data_test_tb_data_f1279ca42de1734be321e88f85775d5f"} -{"leaf_path": "data/test/tb_data/hard_negative_sampler_edge", "sig": "48f994681d719a2546ec4003fcbc638c", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_edge_48f994681d719a2546ec4003fcbc638c"} -{"leaf_path": "data/test/tb_data/hard_negative_sampler_item", "sig": "f23a9eb9457c14a8e57b455804b1f013", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_item_f23a9eb9457c14a8e57b455804b1f013"} -{"leaf_path": "data/test/tb_data/hard_negative_sampler_user", "sig": "23514156eae5a4250ac1d0a118883430", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_user_23514156eae5a4250ac1d0a118883430"} +{"leaf_path": "data/test/tb_data", "sig": "126c375d6aa666633fb3084aa27ff9f7", "remote_path": "data/git_oss_sample_data/data_test_tb_data_126c375d6aa666633fb3084aa27ff9f7"} {"leaf_path": "data/test/tb_data_with_time", "sig": "1a7648f4ae55faf37855762bccbb70cc", "remote_path": "data/git_oss_sample_data/data_test_tb_data_with_time_1a7648f4ae55faf37855762bccbb70cc"} diff --git a/easy_rec/version.py b/easy_rec/version.py index 8127003c0..6e00ca21f 100644 --- a/easy_rec/version.py +++ b/easy_rec/version.py @@ -1,3 +1,3 @@ # -*- encoding:utf-8 -*- # Copyright (c) Alibaba, Inc. and its affiliates. -__version__ = '0.6.0' +__version__ = '0.6.1' From e455c3ccb4e707916774e3078c28d158fb8e4ed0 Mon Sep 17 00:00:00 2001 From: weisu Date: Tue, 7 Feb 2023 11:38:28 +0800 Subject: [PATCH 2/4] [feat]: add combiner for raw feature --- .git_bin_url | 5 ++++- easy_rec/python/input/input.py | 39 +++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/.git_bin_url b/.git_bin_url index da4f7404d..dde02f4c0 100644 --- a/.git_bin_url +++ b/.git_bin_url @@ -37,5 +37,8 @@ {"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"} {"leaf_path": "data/test/mt_ckpt", "sig": "803499f48e2df5e51ce5606e9649c6d4", "remote_path": "data/git_oss_sample_data/data_test_mt_ckpt_803499f48e2df5e51ce5606e9649c6d4"} {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": "data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"} -{"leaf_path": "data/test/tb_data", "sig": "126c375d6aa666633fb3084aa27ff9f7", "remote_path": "data/git_oss_sample_data/data_test_tb_data_126c375d6aa666633fb3084aa27ff9f7"} +{"leaf_path": "data/test/tb_data", "sig": "f1279ca42de1734be321e88f85775d5f", "remote_path": "data/git_oss_sample_data/data_test_tb_data_f1279ca42de1734be321e88f85775d5f"} +{"leaf_path": "data/test/tb_data/hard_negative_sampler_edge", "sig": "48f994681d719a2546ec4003fcbc638c", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_edge_48f994681d719a2546ec4003fcbc638c"} +{"leaf_path": "data/test/tb_data/hard_negative_sampler_item", "sig": "f23a9eb9457c14a8e57b455804b1f013", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_item_f23a9eb9457c14a8e57b455804b1f013"} +{"leaf_path": "data/test/tb_data/hard_negative_sampler_user", "sig": "23514156eae5a4250ac1d0a118883430", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_user_23514156eae5a4250ac1d0a118883430"} {"leaf_path": "data/test/tb_data_with_time", "sig": "1a7648f4ae55faf37855762bccbb70cc", "remote_path": "data/git_oss_sample_data/data_test_tb_data_with_time_1a7648f4ae55faf37855762bccbb70cc"} diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py index 85acc5e72..739024486 100644 --- a/easy_rec/python/input/input.py +++ b/easy_rec/python/input/input.py @@ -482,7 +482,44 @@ def _parse_raw_feature(self, fc, parsed_dict, field_dict): input_0 = fc.input_names[0] feature_name = fc.feature_name if fc.HasField('feature_name') else input_0 if field_dict[input_0].dtype == tf.string: - if fc.raw_input_dim > 1: + + def combine(x): + seq = tf.string_split([x], fc.seq_multi_sep) + seq_len = tf.size(seq) + if fc.raw_input_dim > 1: + check_list = [ + tf.py_func( + check_split, + [seq.values, fc.separator, fc.raw_input_dim, input_0], + Tout=tf.bool) + ] if self._check_mode else [] + with tf.control_dependencies(check_list): + emb = tf.string_split(seq.values, fc.separator).values + else: + emb = seq.values + check_list = [ + tf.py_func(check_string_to_number, [emb, input_0], Tout=tf.bool) + ] if self._check_mode else [] + with tf.control_dependencies(check_list): + emb_val = tf.string_to_number(emb) + emb_vec = tf.reshape(emb_val, [seq_len, -1]) + + if fc.combiner == 'max': + emb_vec = tf.reduce_max(emb_vec, axis=0) + elif fc.combiner == 'min': + emb_vec = tf.reduce_min(emb_vec, axis=0) + elif fc.combiner == 'sum': + emb_vec = tf.reduce_sum(emb_vec, axis=0) + elif fc.combiner == 'mean': + emb_vec = tf.reduce_mean(emb_vec, axis=0) + else: + assert False, 'unsupported combine operator: ' + fc.combiner + return emb_vec + + if fc.HasField('seq_multi_sep') and fc.HasField('combiner'): + parsed_dict[feature_name] = tf.map_fn( + combine, field_dict[input_0], dtype=tf.float32) + elif fc.raw_input_dim > 1: check_list = [ tf.py_func( check_split, From 7ac85371c013234a3c11b8f800ef92d076e57a3a Mon Sep 17 00:00:00 2001 From: weisu Date: Tue, 7 Feb 2023 13:57:29 +0800 Subject: [PATCH 3/4] [feat]: add combiner for raw feature --- docs/source/feature/feature.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/feature/feature.rst b/docs/source/feature/feature.rst index f9c03559a..12c55dc44 100644 --- a/docs/source/feature/feature.rst +++ b/docs/source/feature/feature.rst @@ -171,6 +171,25 @@ RawFeature:连续值特征 - raw_input_dim: 指定embedding特征的维度 +还支持多个embedding特征的聚合操作,如"0.23\|-0.123\|0.923\|-2.123;2.3\|0\|0\|12.33;0\|-1.23\|0.023\|0.32" + +.. code:: protobuf + feature_config:{ + features { + input_names: "pic_emb" + feature_type: RawFeature + separator: '|' + raw_input_dim: 4 + seq_multi_sep: ";" + combiner: "max" + } + } + +- seq_multi_sep: 指定多个embedding序列的分隔符 +- combiner: 指定多个embedding序列的聚合方式,可选值:min, max, mean, sum + +上面例子聚合之后的结果为:`2.3\|0\|0.923\|12.33` + TagFeature ---------------------------------------------------------------- From 769a03aa3976984e1e5f4709c13d14a72d0cedc0 Mon Sep 17 00:00:00 2001 From: weisu Date: Thu, 9 Feb 2023 10:13:08 +0800 Subject: [PATCH 4/4] [feat]: fix the problem of can not display code in document --- docs/source/feature/feature.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/feature/feature.rst b/docs/source/feature/feature.rst index 12c55dc44..f688c742c 100644 --- a/docs/source/feature/feature.rst +++ b/docs/source/feature/feature.rst @@ -143,6 +143,7 @@ RawFeature:连续值特征 也可以手动导入分箱信息。如下: .. code:: protobuf + feature_config:{ features { input_names: "ctr" @@ -160,6 +161,7 @@ RawFeature:连续值特征 这里同样支持embedding特征,如"0.233\|0.123\|0.023\|2.123\|0.233\|0.123\|0.023\|2.123" .. code:: protobuf + feature_config:{ features { input_names: "pic_emb" @@ -174,6 +176,7 @@ RawFeature:连续值特征 还支持多个embedding特征的聚合操作,如"0.23\|-0.123\|0.923\|-2.123;2.3\|0\|0\|12.33;0\|-1.23\|0.023\|0.32" .. code:: protobuf + feature_config:{ features { input_names: "pic_emb" @@ -186,9 +189,9 @@ RawFeature:连续值特征 } - seq_multi_sep: 指定多个embedding序列的分隔符 -- combiner: 指定多个embedding序列的聚合方式,可选值:min, max, mean, sum +- combiner: 指定多个embedding序列的聚合方式,可选值:``min, max, mean, sum`` -上面例子聚合之后的结果为:`2.3\|0\|0.923\|12.33` +上面例子聚合之后的结果为:"2.3\|0\|0.923\|12.33" TagFeature ----------------------------------------------------------------