From 1403998c632c89d3b09edf4749c6b3a1da6da544 Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Mon, 13 Nov 2023 21:15:36 -0500 Subject: [PATCH 1/9] get_label cleanup --- src/learn_to_pick/pick_best.py | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py index e0b53fc..20cb55f 100644 --- a/src/learn_to_pick/pick_best.py +++ b/src/learn_to_pick/pick_best.py @@ -29,7 +29,6 @@ def __init__( self.probability = probability self.score = score - class PickBestEvent(base.Event[PickBestSelected]): def __init__( self, @@ -38,7 +37,7 @@ def __init__( based_on: Dict[str, Any], selected: Optional[PickBestSelected] = None, ): - super().__init__(inputs=inputs, selected=selected) + super().__init__(inputs=inputs, selected=selected or PickBestSelected()) self.to_select_from = to_select_from self.based_on = based_on @@ -86,20 +85,6 @@ def __init__( self.model = model self.auto_embed = auto_embed - def get_label(self, event: PickBestEvent) -> tuple: - cost = None - if event.selected: - chosen_action = event.selected.index - cost = ( - -1.0 * event.selected.score - if event.selected.score is not None - else None - ) - prob = event.selected.probability - return chosen_action, cost, prob - else: - return None, None, None - def get_context_and_action_embeddings(self, event: PickBestEvent) -> tuple: context_emb = base.embed(event.based_on, self.model) if event.based_on else None to_select_from_var_name, to_select_from = next( @@ -163,7 +148,6 @@ def get_indexed_dot_product(self, context_emb: List, action_embs: List) -> Dict: return indexed_dot_product def format_auto_embed_on(self, event: PickBestEvent) -> str: - chosen_action, cost, prob = self.get_label(event) context_emb, action_embs = self.get_context_and_action_embeddings(event) indexed_dot_product = self.get_indexed_dot_product(context_emb, action_embs) @@ -172,9 +156,10 @@ def format_auto_embed_on(self, event: PickBestEvent) -> str: def _tolist(v): return v if isinstance(v, list) else [v] + selected = event.selected labels = ["" for _ in range(nactions)] - if cost is not None: - labels[chosen_action] = f"{chosen_action}:{cost}:{prob} " + if selected.score is not None: + labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " dotprods = [{} for _ in range(nactions)] for i, action in enumerate(action_embs): @@ -198,15 +183,15 @@ def format_auto_embed_off(self, event: PickBestEvent) -> str: """ Converts the `BasedOn` and `ToSelectFrom` into a format that can be used by VW """ - chosen_action, cost, prob = self.get_label(event) context_emb, action_embs = self.get_context_and_action_embeddings(event) nactions = len(action_embs) context_str = f"shared {VwTxt.ns(context_emb)}" + selected = event.selected labels = ["" for _ in range(nactions)] - if cost is not None: - labels[chosen_action] = f"{chosen_action}:{cost}:{prob} " + if selected.score is not None: + labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " actions_str = [f"{l}{VwTxt.ns(a)}" for a, l in zip(action_embs, labels)] return "\n".join([context_str] + actions_str) @@ -294,8 +279,7 @@ def _call_after_predict_before_scoring( sampled_ap = prediction[sampled_index] sampled_action = sampled_ap[0] sampled_prob = sampled_ap[1] - selected = PickBestSelected(index=sampled_action, probability=sampled_prob) - event.selected = selected + event.selected = PickBestSelected(index=sampled_action, probability=sampled_prob) next_inputs = inputs.copy() From 6d791cabdf466519cedbddc39f1a935660576250 Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Mon, 13 Nov 2023 21:17:09 -0500 Subject: [PATCH 2/9] private methods in featurizer --- src/learn_to_pick/pick_best.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py index 20cb55f..711affa 100644 --- a/src/learn_to_pick/pick_best.py +++ b/src/learn_to_pick/pick_best.py @@ -85,7 +85,7 @@ def __init__( self.model = model self.auto_embed = auto_embed - def get_context_and_action_embeddings(self, event: PickBestEvent) -> tuple: + def _get_context_and_action_embeddings(self, event: PickBestEvent) -> tuple: context_emb = base.embed(event.based_on, self.model) if event.based_on else None to_select_from_var_name, to_select_from = next( iter(event.to_select_from.items()), (None, None) @@ -107,7 +107,7 @@ def get_context_and_action_embeddings(self, event: PickBestEvent) -> tuple: ) return context_emb, action_embs - def get_indexed_dot_product(self, context_emb: List, action_embs: List) -> Dict: + def _get_indexed_dot_product(self, context_emb: List, action_embs: List) -> Dict: import numpy as np unique_contexts = set() @@ -147,9 +147,9 @@ def get_indexed_dot_product(self, context_emb: List, action_embs: List) -> Dict: return indexed_dot_product - def format_auto_embed_on(self, event: PickBestEvent) -> str: - context_emb, action_embs = self.get_context_and_action_embeddings(event) - indexed_dot_product = self.get_indexed_dot_product(context_emb, action_embs) + def _format_auto_embed_on(self, event: PickBestEvent) -> str: + context_emb, action_embs = self._get_context_and_action_embeddings(event) + indexed_dot_product = self._get_indexed_dot_product(context_emb, action_embs) nactions = len(action_embs) @@ -179,7 +179,7 @@ def _tolist(v): return "\n".join([shared_str] + actions_str) - def format_auto_embed_off(self, event: PickBestEvent) -> str: + def _format_auto_embed_off(self, event: PickBestEvent) -> str: """ Converts the `BasedOn` and `ToSelectFrom` into a format that can be used by VW """ @@ -197,9 +197,9 @@ def format_auto_embed_off(self, event: PickBestEvent) -> str: def format(self, event: PickBestEvent) -> str: if self.auto_embed: - return self.format_auto_embed_on(event) + return self._format_auto_embed_on(event) else: - return self.format_auto_embed_off(event) + return self._format_auto_embed_off(event) class PickBestRandomPolicy(base.Policy[PickBestEvent]): From 28f84041aaea8d05e6f7d0ad31612f12d1b079cd Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Tue, 14 Nov 2023 15:01:25 -0500 Subject: [PATCH 3/9] news recommendation is runnable --- src/learn_to_pick/base.py | 63 +++++------- src/learn_to_pick/features.py | 29 ++++++ src/learn_to_pick/pick_best.py | 169 +++++++++++---------------------- 3 files changed, 111 insertions(+), 150 deletions(-) create mode 100644 src/learn_to_pick/features.py diff --git a/src/learn_to_pick/base.py b/src/learn_to_pick/base.py index 6612b75..2f824f0 100644 --- a/src/learn_to_pick/base.py +++ b/src/learn_to_pick/base.py @@ -18,6 +18,8 @@ from learn_to_pick.metrics import MetricsTrackerAverage, MetricsTrackerRollingWindow from learn_to_pick.model_repository import ModelRepository from learn_to_pick.vw_logger import VwLogger +from learn_to_pick.features import Featurized, DenseFeatures, SparseFeatures +import numpy as np if TYPE_CHECKING: import vowpal_wabbit_next as vw @@ -108,7 +110,7 @@ def get_based_on_and_to_select_from(inputs: Dict[str, Any]) -> Tuple[Dict, Dict] ) based_on = { - k: inputs[k].value if isinstance(inputs[k].value, list) else [inputs[k].value] + k: inputs[k].value if isinstance(inputs[k].value, list) else inputs[k].value for k in inputs.keys() if isinstance(inputs[k], _BasedOn) } @@ -486,70 +488,57 @@ def run(self, *args, **kwargs) -> Dict[str, Any]: def _embed_string_type( - item: Union[str, _Embed], model: Any, namespace: Optional[str] = None -) -> Dict[str, Union[str, List[str]]]: + item: Union[str, _Embed], model: Any, namespace: str) -> Featurized: """Helper function to embed a string or an _Embed object.""" import re - - keep_str = "" + result = Featurized() if isinstance(item, _Embed): - encoded = _stringify_embedding(model.encode(item.value)) - # TODO these should be moved to pick_best + result[namespace] = model.encode(item.value) if item.keep: keep_str = item.value.replace(" ", "_") + " " - keep_str = re.sub(r"[\t\n\r\f\v]+", " ", keep_str) + result[namespace] = {'raw': re.sub(r"[\t\n\r\f\v]+", " ", keep_str)} elif isinstance(item, str): encoded = item.replace(" ", "_") - encoded = re.sub(r"[\t\n\r\f\v]+", " ", encoded) + result[namespace] = {'raw': re.sub(r"[\t\n\r\f\v]+", " ", encoded)} else: raise ValueError(f"Unsupported type {type(item)} for embedding") - if namespace is None: - raise ValueError( - "The default namespace must be provided when embedding a string or _Embed object." - ) - - return {namespace: keep_str + encoded} + return result -def _embed_dict_type(item: Dict, model: Any) -> Dict[str, Any]: +def _embed_dict_type(item: Dict, model: Any) -> Featurized: """Helper function to embed a dictionary item.""" - inner_dict: Dict = {} + result = Featurized() for ns, embed_item in item.items(): if isinstance(embed_item, list): - inner_dict[ns] = [] - for embed_list_item in embed_item: - embedded = _embed_string_type(embed_list_item, model, ns) - inner_dict[ns].append(embedded[ns]) + for idx, embed_list_item in enumerate(embed_item): + result.merge(_embed_string_type(embed_list_item, model, f'{idx}_{ns}')) else: - inner_dict.update(_embed_string_type(embed_item, model, ns)) - return inner_dict + result.merge(_embed_string_type(embed_item, model, ns)) + return result def _embed_list_type( item: list, model: Any, namespace: Optional[str] = None -) -> List[Dict[str, Union[str, List[str]]]]: - ret_list: List = [] +) -> List[Featurized]: + result = [] for embed_item in item: if isinstance(embed_item, dict): - ret_list.append(_embed_dict_type(embed_item, model)) + result.append(_embed_dict_type(embed_item, model)) elif isinstance(embed_item, list): - item_embedding = _embed_list_type(embed_item, model, namespace) - # Get the first key from the first dictionary - first_key = next(iter(item_embedding[0])) - # Group the values under that key - grouping = {first_key: [item[first_key] for item in item_embedding]} - ret_list.append(grouping) + result.append(Featurized()) + for idx, embed_list_item in enumerate(embed_item): + result[-1].merge(_embed_string_type(embed_list_item, model, f'{idx}')) else: - ret_list.append(_embed_string_type(embed_item, model, namespace)) - return ret_list + result.append(_embed_string_type(embed_item, model, namespace)) + return result def embed( to_embed: Union[Union[str, _Embed], Dict, List[Union[str, _Embed]], List[Dict]], model: Any, namespace: Optional[str] = None, -) -> List[Dict[str, Union[str, List[str]]]]: +) -> Union[Featurized, List[Featurized]]: """ Embeds the actions or context using the SentenceTransformer model (or a model that has an `encode` function) @@ -563,9 +552,9 @@ def embed( if (isinstance(to_embed, _Embed) and isinstance(to_embed.value, str)) or isinstance( to_embed, str ): - return [_embed_string_type(to_embed, model, namespace)] + return _embed_string_type(to_embed, model, namespace) elif isinstance(to_embed, dict): - return [_embed_dict_type(to_embed, model)] + return _embed_dict_type(to_embed, model) elif isinstance(to_embed, list): return _embed_list_type(to_embed, model, namespace) else: diff --git a/src/learn_to_pick/features.py b/src/learn_to_pick/features.py new file mode 100644 index 0000000..b03c808 --- /dev/null +++ b/src/learn_to_pick/features.py @@ -0,0 +1,29 @@ +from typing import Union, Optional, Dict, List +import numpy as np + +class SparseFeatures(dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +class DenseFeatures(list): + def __init__(self, *args, **kwargs): + super().__init__(np.array(*args, **kwargs)) + + +class Featurized: + def __init__(self, sparse: Optional[Dict[str, SparseFeatures]] = None, dense: Optional[Dict[str, DenseFeatures]] = None): + self.sparse = sparse or {} + self.dense = dense or {} + + def __setitem__(self, key, value): + if isinstance(value, Dict): + self.sparse[key] = SparseFeatures(value) + elif isinstance(value, List) or isinstance(value, np.ndarray): + self.dense[key] = DenseFeatures(value) + else: + raise ValueError(f'Cannot convert {type(value)} to either DenseFeatures or SparseFeatures') + + def merge(self, other): + self.sparse.update(other.sparse) + self.dense.update(other.dense) diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py index 711affa..c51bc08 100644 --- a/src/learn_to_pick/pick_best.py +++ b/src/learn_to_pick/pick_best.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union, Iterable from itertools import chain import os +import numpy as np from learn_to_pick import base @@ -41,27 +42,43 @@ def __init__( self.to_select_from = to_select_from self.based_on = based_on + def context(self, model) -> base.Featurized: + return base.embed(self.based_on or [], model) -class VwTxt: - @staticmethod - def embedding(embedding: List[float]) -> str: - return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)]) + def actions(self, model) -> List[base.Featurized]: + to_select_from_var_name, to_select_from = next( + iter(self.to_select_from.items()), (None, None) + ) - @staticmethod - def features(features: Union[str, List[str]]) -> str: - return " ".join(features) if isinstance(features, list) else features + action_embs = ( + ( + base.embed(to_select_from, model, to_select_from_var_name) + if self.to_select_from + else None + ) + if to_select_from + else None + ) + if not action_embs: + raise ValueError( + "Context and to_select_from must be provided in the inputs dictionary" + ) + return action_embs +class VwTxt: @staticmethod - def _namespaces(ns: Iterable[Tuple[str, Union[str, List[str]]]]): - return " ".join(f"|{k} {VwTxt.features(v)}" for k, v in ns) - + def _dense_2_str(values: base.DenseFeatures) -> str: + return " ".join([f"{i}:{e}" for i, e in enumerate(values)]) + + @staticmethod + def _sparse_2_str(values: base.SparseFeatures) -> str: + return " ".join([f"{k}:{v}" for k, v in values.items()]) + @staticmethod - def ns(ns: Union[Iterable[Tuple[str, Any]], List[Dict[str, Any]], Dict[str, Any]]): - if isinstance(ns, List): - ns = chain.from_iterable(map(dict.items, ns)) - if isinstance(ns, Dict): - ns = ns.items() - return VwTxt._namespaces(ns) + def featurized_2_str(obj: base.Featurized) -> str: + return " ".join(chain.from_iterable([ + map(lambda kv: f'|{kv[0]} {VwTxt._dense_2_str(kv[1])}', obj.dense.items()), + map(lambda kv: f'|{kv[0]} {VwTxt._sparse_2_str(kv[1])}', obj.sparse.items())])) class PickBestFeaturizer(base.Featurizer[PickBestEvent]): @@ -85,121 +102,47 @@ def __init__( self.model = model self.auto_embed = auto_embed - def _get_context_and_action_embeddings(self, event: PickBestEvent) -> tuple: - context_emb = base.embed(event.based_on, self.model) if event.based_on else None - to_select_from_var_name, to_select_from = next( - iter(event.to_select_from.items()), (None, None) - ) - - action_embs = ( - ( - base.embed(to_select_from, self.model, to_select_from_var_name) - if event.to_select_from - else None - ) - if to_select_from - else None - ) - - if not context_emb or not action_embs: - raise ValueError( - "Context and to_select_from must be provided in the inputs dictionary" - ) - return context_emb, action_embs - - def _get_indexed_dot_product(self, context_emb: List, action_embs: List) -> Dict: - import numpy as np - - unique_contexts = set() - for context_item in context_emb: - for ns, ee in context_item.items(): - if isinstance(ee, list): - for ea in ee: - unique_contexts.add(f"{ns}={ea}") - else: - unique_contexts.add(f"{ns}={ee}") - - encoded_contexts = self.model.encode(list(unique_contexts)) - context_embeddings = dict(zip(unique_contexts, encoded_contexts)) - - unique_actions = set() - for action in action_embs: - for ns, e in action.items(): - if isinstance(e, list): - for ea in e: - unique_actions.add(f"{ns}={ea}") - else: - unique_actions.add(f"{ns}={e}") - - encoded_actions = self.model.encode(list(unique_actions)) - action_embeddings = dict(zip(unique_actions, encoded_actions)) - - action_matrix = np.stack([v for k, v in action_embeddings.items()]) - context_matrix = np.stack([v for k, v in context_embeddings.items()]) - dot_product_matrix = np.dot(context_matrix, action_matrix.T) - - indexed_dot_product: Dict = {} - - for i, context_key in enumerate(context_embeddings.keys()): - indexed_dot_product[context_key] = {} - for j, action_key in enumerate(action_embeddings.keys()): - indexed_dot_product[context_key][action_key] = dot_product_matrix[i, j] - - return indexed_dot_product - - def _format_auto_embed_on(self, event: PickBestEvent) -> str: - context_emb, action_embs = self._get_context_and_action_embeddings(event) - indexed_dot_product = self._get_indexed_dot_product(context_emb, action_embs) - + def _featurize_auto_embed_on(self, event: PickBestEvent) -> str: + context_emb = event.context(self.model) + action_embs = event.actions(self.model) + + context_names = list(context_emb.dense.keys()) + context_matrix = np.stack(list(context_emb.dense.values())) + for a in action_embs: + action_names = list(a.dense.keys()) + product = np.dot(context_matrix, np.stack(list(a.dense.values()).T)) + a['dotproduct'] = {f'{context_names[i]}_{action_names[j]}': product[i, j] for i in range(len(context_names)) for j in range(len(action_names))} + nactions = len(action_embs) - - def _tolist(v): - return v if isinstance(v, list) else [v] - + context_str = f"shared {VwTxt.featurized_2_str(context_emb)}" selected = event.selected labels = ["" for _ in range(nactions)] if selected.score is not None: labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " - - dotprods = [{} for _ in range(nactions)] - for i, action in enumerate(action_embs): - action["#"] = [f"{k}={v}" for k, _v in action.items() for v in _tolist(_v)] - dotprods[i] = [ - v[f] for v in indexed_dot_product.values() for f in action["#"] - ] - - actions_str = [ - f"{l}{VwTxt.ns(a)} |dotprod {VwTxt.embedding(dp)}" - for l, a, dp in zip(labels, action_embs, dotprods) - ] - - for item in context_emb: - item["@"] = [f"{k}={v}" for k, _v in item.items() for v in _tolist(_v)] - shared_str = f"shared {VwTxt.ns(context_emb)}" - - return "\n".join([shared_str] + actions_str) - - def _format_auto_embed_off(self, event: PickBestEvent) -> str: + actions_str = [f"{l}{VwTxt.featurized_2_str(a)}" for a, l in zip(action_embs, labels)] + return "\n".join([context_str] + actions_str) + + def _featurize_auto_embed_off(self, event: PickBestEvent) -> str: """ Converts the `BasedOn` and `ToSelectFrom` into a format that can be used by VW """ - context_emb, action_embs = self.get_context_and_action_embeddings(event) - nactions = len(action_embs) - - context_str = f"shared {VwTxt.ns(context_emb)}" + context_emb = event.context(self.model) + action_embs = event.actions(self.model) + nactions = len(action_embs) + context_str = f"shared {VwTxt.featurized_2_str(context_emb)}" selected = event.selected labels = ["" for _ in range(nactions)] if selected.score is not None: labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " - actions_str = [f"{l}{VwTxt.ns(a)}" for a, l in zip(action_embs, labels)] + actions_str = [f"{l}{VwTxt.featurized_2_str(a)}" for a, l in zip(action_embs, labels)] return "\n".join([context_str] + actions_str) def format(self, event: PickBestEvent) -> str: if self.auto_embed: - return self._format_auto_embed_on(event) + return self._featurize_auto_embed_on(event) else: - return self._format_auto_embed_off(event) + return self._featurize_auto_embed_off(event) class PickBestRandomPolicy(base.Policy[PickBestEvent]): From c34f23525ca8c56b6403d2f94676a5b45e373273 Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Tue, 14 Nov 2023 22:51:20 -0500 Subject: [PATCH 4/9] tests fix --- src/learn_to_pick/__init__.py | 1 - src/learn_to_pick/base.py | 10 +- src/learn_to_pick/pick_best.py | 80 +-- tests/unit_tests/test_pick_best_call.py | 34 +- .../test_pick_best_text_embedder.py | 184 ++++--- .../unit_tests/test_rl_loop_base_embedder.py | 470 +++++++++--------- tests/unit_tests/test_utils.py | 2 +- 7 files changed, 423 insertions(+), 358 deletions(-) diff --git a/src/learn_to_pick/__init__.py b/src/learn_to_pick/__init__.py index a6894b3..dcdb105 100644 --- a/src/learn_to_pick/__init__.py +++ b/src/learn_to_pick/__init__.py @@ -53,5 +53,4 @@ def configure_logger() -> None: "VwPolicy", "VwLogger", "embed", - "stringify_embedding", ] diff --git a/src/learn_to_pick/base.py b/src/learn_to_pick/base.py index 2f824f0..c5dbf80 100644 --- a/src/learn_to_pick/base.py +++ b/src/learn_to_pick/base.py @@ -89,10 +89,6 @@ def EmbedAndKeep(anything: Any) -> Any: # helper functions -def _stringify_embedding(embedding: List) -> str: - return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)]) - - def _parse_lines(parser: "vw.TextFormatParser", input_str: str) -> List["vw.Example"]: return [parser.parse_line(line) for line in input_str.split("\n")] @@ -493,9 +489,9 @@ def _embed_string_type( import re result = Featurized() if isinstance(item, _Embed): - result[namespace] = model.encode(item.value) + result[namespace] = DenseFeatures(model.encode(item.value)) if item.keep: - keep_str = item.value.replace(" ", "_") + " " + keep_str = item.value.replace(" ", "_") result[namespace] = {'raw': re.sub(r"[\t\n\r\f\v]+", " ", keep_str)} elif isinstance(item, str): encoded = item.replace(" ", "_") @@ -512,7 +508,7 @@ def _embed_dict_type(item: Dict, model: Any) -> Featurized: for ns, embed_item in item.items(): if isinstance(embed_item, list): for idx, embed_list_item in enumerate(embed_item): - result.merge(_embed_string_type(embed_list_item, model, f'{idx}_{ns}')) + result.merge(_embed_string_type(embed_list_item, model, f'{ns}_{idx}')) else: result.merge(_embed_string_type(embed_item, model, ns)) return result diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py index c51bc08..ebd682e 100644 --- a/src/learn_to_pick/pick_best.py +++ b/src/learn_to_pick/pick_best.py @@ -43,7 +43,7 @@ def __init__( self.based_on = based_on def context(self, model) -> base.Featurized: - return base.embed(self.based_on or [], model) + return base.embed(self.based_on or {}, model) def actions(self, model) -> List[base.Featurized]: to_select_from_var_name, to_select_from = next( @@ -69,16 +69,20 @@ class VwTxt: @staticmethod def _dense_2_str(values: base.DenseFeatures) -> str: return " ".join([f"{i}:{e}" for i, e in enumerate(values)]) - + @staticmethod def _sparse_2_str(values: base.SparseFeatures) -> str: - return " ".join([f"{k}:{v}" for k, v in values.items()]) + def _to_str(v): + import numbers + return v if isinstance(v, numbers.Number) else f'={v}' + + return " ".join([f"{k}:{_to_str(v)}" for k, v in values.items()]) @staticmethod def featurized_2_str(obj: base.Featurized) -> str: return " ".join(chain.from_iterable([ - map(lambda kv: f'|{kv[0]} {VwTxt._dense_2_str(kv[1])}', obj.dense.items()), - map(lambda kv: f'|{kv[0]} {VwTxt._sparse_2_str(kv[1])}', obj.sparse.items())])) + map(lambda kv: f'|{kv[0]}_dense {VwTxt._dense_2_str(kv[1])}', obj.dense.items()), + map(lambda kv: f'|{kv[0]}_sparse {VwTxt._sparse_2_str(kv[1])}', obj.sparse.items())])) class PickBestFeaturizer(base.Featurizer[PickBestEvent]): @@ -102,33 +106,45 @@ def __init__( self.model = model self.auto_embed = auto_embed - def _featurize_auto_embed_on(self, event: PickBestEvent) -> str: - context_emb = event.context(self.model) - action_embs = event.actions(self.model) + def _dotproducts(self, context, actions): + _context_dense = base.Featurized() + for ns in context.sparse.keys(): + if 'raw' in context.sparse[ns]: + _context_dense[ns] = self.model.encode(context.sparse[ns]['raw']) + + _actions_dense = [base.Featurized() for _ in range(len(actions))] + for _action, action in zip(_actions_dense, actions): + for ns in action.sparse.keys(): + if 'raw' in action.sparse[ns]: + _action[ns] = self.model.encode(action.sparse[ns]['raw']) + + context_names = list(_context_dense.dense.keys()) + context_matrix = np.stack(list(_context_dense.dense.values())) + for _a, a in zip(_actions_dense, actions): + action_names = list(_a.dense.keys()) + product = np.dot(context_matrix, np.stack(list(_a.dense.values())).T) + a['dotprod'] = {f'{context_names[i]}_{action_names[j]}': product[i, j] for i in range(len(context_names)) for j in range(len(action_names))} + + def _generic_namespace(self, featurized): + result = base.SparseFeatures() + for ns in featurized.sparse.keys(): + if 'raw' in featurized.sparse[ns]: + result[ns] = featurized.sparse[ns]['raw'] + return result + + def _generic_namespaces(self, context, actions): + context['@'] = self._generic_namespace(context) + for a in actions: + a['#'] = self._generic_namespace(a) - context_names = list(context_emb.dense.keys()) - context_matrix = np.stack(list(context_emb.dense.values())) - for a in action_embs: - action_names = list(a.dense.keys()) - product = np.dot(context_matrix, np.stack(list(a.dense.values()).T)) - a['dotproduct'] = {f'{context_names[i]}_{action_names[j]}': product[i, j] for i in range(len(context_names)) for j in range(len(action_names))} - - nactions = len(action_embs) - context_str = f"shared {VwTxt.featurized_2_str(context_emb)}" - selected = event.selected - labels = ["" for _ in range(nactions)] - if selected.score is not None: - labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " - actions_str = [f"{l}{VwTxt.featurized_2_str(a)}" for a, l in zip(action_embs, labels)] - return "\n".join([context_str] + actions_str) - - def _featurize_auto_embed_off(self, event: PickBestEvent) -> str: - """ - Converts the `BasedOn` and `ToSelectFrom` into a format that can be used by VW - """ + def format(self, event: PickBestEvent) -> str: context_emb = event.context(self.model) action_embs = event.actions(self.model) + if self.auto_embed: + self._dotproducts(context_emb, action_embs) + self._generic_namespaces(context_emb, action_embs) + nactions = len(action_embs) context_str = f"shared {VwTxt.featurized_2_str(context_emb)}" selected = event.selected @@ -137,13 +153,7 @@ def _featurize_auto_embed_off(self, event: PickBestEvent) -> str: labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " actions_str = [f"{l}{VwTxt.featurized_2_str(a)}" for a, l in zip(action_embs, labels)] return "\n".join([context_str] + actions_str) - - def format(self, event: PickBestEvent) -> str: - if self.auto_embed: - return self._featurize_auto_embed_on(event) - else: - return self._featurize_auto_embed_off(event) - + class PickBestRandomPolicy(base.Policy[PickBestEvent]): def __init__(self): diff --git a/tests/unit_tests/test_pick_best_call.py b/tests/unit_tests/test_pick_best_call.py index c2ef16e..43891bd 100644 --- a/tests/unit_tests/test_pick_best_call.py +++ b/tests/unit_tests/test_pick_best_call.py @@ -161,15 +161,16 @@ def test_everything_embedded() -> None: str1 = "0" str2 = "1" str3 = "2" - encoded_str1 = rl_loop._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = rl_loop._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = rl_loop._stringify_embedding(list(encoded_keyword + str3)) - + action_dense = "0:1.0 1:0.0" + ctx_str_1 = "context1" + encoded_ctx_str_1 = "0:8.0 1:0.0" - encoded_ctx_str_1 = rl_loop._stringify_embedding(list(encoded_keyword + ctx_str_1)) - - expected = f"""shared |User {ctx_str_1 + " " + encoded_ctx_str_1} \n|action {str1 + " " + encoded_str1} \n|action {str2 + " " + encoded_str2} \n|action {str3 + " " + encoded_str3} """ # noqa + expected = "\n".join([ + f"shared |User_dense {encoded_ctx_str_1} |User_sparse raw:={ctx_str_1}", + f"|action_dense {action_dense} |action_sparse raw:={str1}", + f"|action_dense {action_dense} |action_sparse raw:={str2}", + f"|action_dense {action_dense} |action_sparse raw:={str3}"]) # noqa actions = [str1, str2, str3] @@ -191,7 +192,11 @@ def test_default_auto_embedder_is_off() -> None: str3 = "2" ctx_str_1 = "context1" - expected = f"""shared |User {ctx_str_1} \n|action {str1} \n|action {str2} \n|action {str3} """ # noqa + expected = "\n".join([ + f"shared |User_sparse raw:={ctx_str_1}", + f"|action_sparse raw:={str1}", + f"|action_sparse raw:={str2}", + f"|action_sparse raw:={str3}"]) # noqa actions = [str1, str2, str3] @@ -213,7 +218,11 @@ def test_default_w_embeddings_off() -> None: str3 = "2" ctx_str_1 = "context1" - expected = f"""shared |User {ctx_str_1} \n|action {str1} \n|action {str2} \n|action {str3} """ # noqa + expected = "\n".join([ + f"shared |User_sparse raw:={ctx_str_1}", + f"|action_sparse raw:={str1}", + f"|action_sparse raw:={str2}", + f"|action_sparse raw:={str3}"]) # noqa actions = [str1, str2, str3] @@ -235,9 +244,12 @@ def test_default_w_embeddings_on() -> None: str1 = "0" str2 = "1" ctx_str_1 = "context1" - dot_prod = "dotprod 0:5.0" # dot prod of [1.0, 2.0] and [1.0, 2.0] + dot_prod = "dotprod_sparse User_action:5.0" # dot prod of [1.0, 2.0] and [1.0, 2.0] - expected = f"""shared |User {ctx_str_1} |@ User={ctx_str_1}\n|action {str1} |# action={str1} |{dot_prod}\n|action {str2} |# action={str2} |{dot_prod}""" # noqa + expected = "\n".join([ + f"shared |User_sparse raw:={ctx_str_1} |@_sparse User:={ctx_str_1}", + f"|action_sparse raw:={str1} |{dot_prod} |#_sparse action:={str1} ", + f"|action_sparse raw:={str2} |{dot_prod} |#_sparse action:={str2} "]) # noqa actions = [str1, str2] diff --git a/tests/unit_tests/test_pick_best_text_embedder.py b/tests/unit_tests/test_pick_best_text_embedder.py index 46c4f9e..0d8b47d 100644 --- a/tests/unit_tests/test_pick_best_text_embedder.py +++ b/tests/unit_tests/test_pick_best_text_embedder.py @@ -4,10 +4,8 @@ import learn_to_pick.base as rl_chain import learn_to_pick.pick_best as pick_best_chain -encoded_keyword = "[encoded]" - -def test_pickbest_textembedder_missing_context_throws() -> None: +def test_pickbest_textembedder_missing_context_not_throws() -> None: featurizer = pick_best_chain.PickBestFeaturizer( auto_embed=False, model=MockEncoder() ) @@ -15,8 +13,7 @@ def test_pickbest_textembedder_missing_context_throws() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_action, based_on={} ) - with pytest.raises(ValueError): - featurizer.format(event) + featurizer.format(event) def test_pickbest_textembedder_missing_actions_throws() -> None: @@ -34,8 +31,13 @@ def test_pickbest_textembedder_no_label_no_emb() -> None: featurizer = pick_best_chain.PickBestFeaturizer( auto_embed=False, model=MockEncoder() ) - named_actions = {"action1": ["0", "1", "2"]} - expected = """shared |context context \n|action1 0 \n|action1 1 \n|action1 2 """ + named_actions = {"action": ["0", "1", "2"]} + expected = "\n".join([ + "shared |context_sparse raw:=context", + "|action_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2"]) + event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on={"context": "context"} ) @@ -47,8 +49,12 @@ def test_pickbest_textembedder_w_label_no_score_no_emb() -> None: featurizer = pick_best_chain.PickBestFeaturizer( auto_embed=False, model=MockEncoder() ) - named_actions = {"action1": ["0", "1", "2"]} - expected = """shared |context context \n|action1 0 \n|action1 1 \n|action1 2 """ + named_actions = {"action": ["0", "1", "2"]} + expected = "\n".join([ + "shared |context_sparse raw:=context", + "|action_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2"]) selected = pick_best_chain.PickBestSelected(index=0, probability=1.0) event = pick_best_chain.PickBestEvent( inputs={}, @@ -64,10 +70,13 @@ def test_pickbest_textembedder_w_full_label_no_emb() -> None: featurizer = pick_best_chain.PickBestFeaturizer( auto_embed=False, model=MockEncoder() ) - named_actions = {"action1": ["0", "1", "2"]} - expected = ( - """shared |context context \n0:-0.0:1.0 |action1 0 \n|action1 1 \n|action1 2 """ - ) + named_actions = {"action": ["0", "1", "2"]} + expected = "\n".join([ + "shared |context_sparse raw:=context", + "0:-0.0:1.0 |action_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2"]) + selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, @@ -86,16 +95,17 @@ def test_pickbest_textembedder_w_full_label_w_emb() -> None: str1 = "0" str2 = "1" str3 = "2" - encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = rl_chain._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3)) - ctx_str_1 = "context1" - encoded_ctx_str_1 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_1)) + ctx_str = "ctx" + encoded_ctx_str = "0:3.0 1:0.0" - named_actions = {"action1": rl_chain.Embed([str1, str2, str3])} - context = {"context": rl_chain.Embed(ctx_str_1)} - expected = f"""shared |context {encoded_ctx_str_1} \n0:-0.0:1.0 |action1 {encoded_str1} \n|action1 {encoded_str2} \n|action1 {encoded_str3} """ # noqa: E501 + named_actions = {"action": rl_chain.Embed([str1, str2, str3])} + context = {"context": rl_chain.Embed(ctx_str)} + expected = "\n".join([ + f"shared |context_dense {encoded_ctx_str}", + "0:-0.0:1.0 |action_dense 0:1.0 1:0.0", + "|action_dense 0:1.0 1:0.0", + "|action_dense 0:1.0 1:0.0"]) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -111,16 +121,17 @@ def test_pickbest_textembedder_w_full_label_w_embed_and_keep() -> None: str1 = "0" str2 = "1" str3 = "2" - encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = rl_chain._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3)) - ctx_str_1 = "context1" - encoded_ctx_str_1 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_1)) + ctx_str = "ctx" + encoded_ctx_str = "0:3.0 1:0.0" - named_actions = {"action1": rl_chain.EmbedAndKeep([str1, str2, str3])} - context = {"context": rl_chain.EmbedAndKeep(ctx_str_1)} - expected = f"""shared |context {ctx_str_1 + " " + encoded_ctx_str_1} \n0:-0.0:1.0 |action1 {str1 + " " + encoded_str1} \n|action1 {str2 + " " + encoded_str2} \n|action1 {str3 + " " + encoded_str3} """ # noqa: E501 + named_actions = {"action": rl_chain.EmbedAndKeep([str1, str2, str3])} + context = {"context": rl_chain.EmbedAndKeep(ctx_str)} + expected = "\n".join([ + f"shared |context_dense {encoded_ctx_str} |context_sparse raw:={ctx_str}", + "0:-0.0:1.0 |action_dense 0:1.0 1:0.0 |action_sparse raw:=0", + "|action_dense 0:1.0 1:0.0 |action_sparse raw:=1", + "|action_dense 0:1.0 1:0.0 |action_sparse raw:=2"]) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -135,7 +146,11 @@ def test_pickbest_textembedder_more_namespaces_no_label_no_emb() -> None: ) named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]} context = {"context1": "context1", "context2": "context2"} - expected = """shared |context1 context1 |context2 context2 \n|a 0 |b 0 \n|action1 1 \n|action1 2 """ # noqa: E501 + expected = "\n".join([ + "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2 ", + "|a_sparse raw:=0 |b_sparse raw:=0", + "|action1_sparse raw:=1", + "|action1_sparse raw:=2"]) # noqa: E501 event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) @@ -147,9 +162,13 @@ def test_pickbest_textembedder_more_namespaces_w_label_no_emb() -> None: featurizer = pick_best_chain.PickBestFeaturizer( auto_embed=False, model=MockEncoder() ) - named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]} + named_actions = {"action": [{"a": "0", "b": "0"}, "1", "2"]} context = {"context1": "context1", "context2": "context2"} - expected = """shared |context1 context1 |context2 context2 \n|a 0 |b 0 \n|action1 1 \n|action1 2 """ # noqa: E501 + expected = "\n".join([ + "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2", + "|a_sparse raw:=0 |b_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2"]) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -162,9 +181,13 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_no_emb() -> None: featurizer = pick_best_chain.PickBestFeaturizer( auto_embed=False, model=MockEncoder() ) - named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]} + named_actions = {"action": [{"a": "0", "b": "0"}, "1", "2"]} context = {"context1": "context1", "context2": "context2"} - expected = """shared |context1 context1 |context2 context2 \n0:-0.0:1.0 |a 0 |b 0 \n|action1 1 \n|action1 2 """ # noqa: E501 + expected = "\n".join([ + "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2", + "0:-0.0:1.0 |a_sparse raw:=0 |b_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2"]) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -181,21 +204,22 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_emb() -> None str1 = "0" str2 = "1" str3 = "2" - encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = rl_chain._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3)) - ctx_str_1 = "context1" - ctx_str_2 = "context2" - encoded_ctx_str_1 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_1)) - encoded_ctx_str_2 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_2)) + ctx_str_1 = "ctx" + ctx_str_2 = "ctx_" + encoded_ctx_str_1 = "0:3.0 1:0.0" + encoded_ctx_str_2 = "0:4.0 1:0.0" - named_actions = {"action1": rl_chain.Embed([{"a": str1, "b": str1}, str2, str3])} + named_actions = {"action": rl_chain.Embed([{"a": str1, "b": str1}, str2, str3])} context = { "context1": rl_chain.Embed(ctx_str_1), "context2": rl_chain.Embed(ctx_str_2), } - expected = f"""shared |context1 {encoded_ctx_str_1} |context2 {encoded_ctx_str_2} \n0:-0.0:1.0 |a {encoded_str1} |b {encoded_str1} \n|action1 {encoded_str2} \n|action1 {encoded_str3} """ # noqa: E501 + expected = "\n".join([ + f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2}", + f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0", + f"|action_dense 0:1.0 1:0.0", + f"|action_dense 0:1.0 1:0.0"]) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( @@ -215,24 +239,25 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_embed_and_kee str1 = "0" str2 = "1" str3 = "2" - encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = rl_chain._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3)) - ctx_str_1 = "context1" - ctx_str_2 = "context2" - encoded_ctx_str_1 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_1)) - encoded_ctx_str_2 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_2)) + ctx_str_1 = "ctx" + ctx_str_2 = "ctx_" + encoded_ctx_str_1 = "0:3.0 1:0.0" + encoded_ctx_str_2 = "0:4.0 1:0.0" named_actions = { - "action1": rl_chain.EmbedAndKeep([{"a": str1, "b": str1}, str2, str3]) + "action": rl_chain.EmbedAndKeep([{"a": str1, "b": str1}, str2, str3]) } context = { "context1": rl_chain.EmbedAndKeep(ctx_str_1), "context2": rl_chain.EmbedAndKeep(ctx_str_2), } - expected = f"""shared |context1 {ctx_str_1 + " " + encoded_ctx_str_1} |context2 {ctx_str_2 + " " + encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1 + " " + encoded_str1} |b {str1 + " " + encoded_str1} \n|action1 {str2 + " " + encoded_str2} \n|action1 {str3 + " " + encoded_str3} """ # noqa: E501 - + expected = "\n".join([ + f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1} |context2_sparse raw:={ctx_str_2}", + f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0 |b_sparse raw:=0", + f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=1", + f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=2"]) # noqa: E501 + selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -249,18 +274,21 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emb() -> N str1 = "0" str2 = "1" str3 = "2" - encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1)) - encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3)) - ctx_str_1 = "context1" - ctx_str_2 = "context2" - encoded_ctx_str_2 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_2)) + ctx_str_1 = "ctx" + ctx_str_2 = "ctx_" + encoded_ctx_str_2 = "0:4.0 1:0.0" named_actions = { - "action1": [{"a": str1, "b": rl_chain.Embed(str1)}, str2, rl_chain.Embed(str3)] + "action": [{"a": str1, "b": rl_chain.Embed(str1)}, str2, rl_chain.Embed(str3)] } context = {"context1": ctx_str_1, "context2": rl_chain.Embed(ctx_str_2)} - expected = f"""shared |context1 {ctx_str_1} |context2 {encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1} |b {encoded_str1} \n|action1 {str2} \n|action1 {encoded_str3} """ # noqa: E501 + + expected = "\n".join([ + f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1}", + f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0", + f"|action_sparse raw:=1", + f"|action_dense 0:1.0 1:0.0"]) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( @@ -278,23 +306,24 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emakeep() str1 = "0" str2 = "1" str3 = "2" - encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1)) - encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3)) - ctx_str_1 = "context1" - ctx_str_2 = "context2" - encoded_ctx_str_2 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_2)) + ctx_str_1 = "ctx" + ctx_str_2 = "ctx_" + encoded_ctx_str_2 = "0:4.0 1:0.0" named_actions = { - "action1": [ + "action": [ {"a": str1, "b": rl_chain.EmbedAndKeep(str1)}, str2, rl_chain.EmbedAndKeep(str3), ] } context = {"context1": ctx_str_1, "context2": rl_chain.EmbedAndKeep(ctx_str_2)} - expected = f"""shared |context1 {ctx_str_1} |context2 {ctx_str_2 + " " + encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1} |b {str1 + " " + encoded_str1} \n|action1 {str2} \n|action1 {str3 + " " + encoded_str3} """ # noqa: E501 - + expected = "\n".join([ + f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1} |context2_sparse raw:={ctx_str_2}", + f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0 |b_sparse raw:=0", + f"|action_sparse raw:=1", + f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=2"]) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -309,18 +338,19 @@ def test_raw_features_underscored() -> None: ) str1 = "this is a long string" str1_underscored = str1.replace(" ", "_") - encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1)) + encoded_str1 = f"0:{float(len(str1))} 1:0.0" ctx_str = "this is a long context" ctx_str_underscored = ctx_str.replace(" ", "_") - encoded_ctx_str = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str)) + encoded_ctx_str = f"0:{float(len(ctx_str))} 1:0.0" # No embeddings named_actions = {"action": [str1]} context = {"context": ctx_str} - expected_no_embed = ( - f"""shared |context {ctx_str_underscored} \n|action {str1_underscored} """ - ) + expected_no_embed = "\n".join([ + f"shared |context_sparse raw:={ctx_str_underscored}", + f"|action_sparse raw:={str1_underscored}"]) + event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) @@ -330,7 +360,9 @@ def test_raw_features_underscored() -> None: # Just embeddings named_actions = {"action": rl_chain.Embed([str1])} context = {"context": rl_chain.Embed(ctx_str)} - expected_embed = f"""shared |context {encoded_ctx_str} \n|action {encoded_str1} """ + expected_embed = "\n".join([ + f"shared |context_dense {encoded_ctx_str}", + f"|action_dense {encoded_str1}"]) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) @@ -340,7 +372,9 @@ def test_raw_features_underscored() -> None: # Embeddings and raw features named_actions = {"action": rl_chain.EmbedAndKeep([str1])} context = {"context": rl_chain.EmbedAndKeep(ctx_str)} - expected_embed_and_keep = f"""shared |context {ctx_str_underscored + " " + encoded_ctx_str} \n|action {str1_underscored + " " + encoded_str1} """ # noqa: E501 + expected_embed_and_keep = "\n".join([ + f"shared |context_dense {encoded_ctx_str} |context_sparse raw:={ctx_str_underscored}", + f"|action_dense {encoded_str1} |action_sparse raw:={str1_underscored}"]) # noqa: E501 event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) diff --git a/tests/unit_tests/test_rl_loop_base_embedder.py b/tests/unit_tests/test_rl_loop_base_embedder.py index af2e2b5..c93f2df 100644 --- a/tests/unit_tests/test_rl_loop_base_embedder.py +++ b/tests/unit_tests/test_rl_loop_base_embedder.py @@ -5,174 +5,178 @@ import learn_to_pick.base as base -encoded_keyword = "[encoded]" - def test_simple_context_str_no_emb() -> None: - expected = [{"a_namespace": "test"}] - assert base.embed("test", MockEncoder(), "a_namespace") == expected + expected = {"a_namespace": {"raw": "test"}} + + featurized = base.embed("test", MockEncoder(), "a_namespace") + assert featurized.sparse == expected + assert featurized.dense == {} def test_simple_context_str_w_emb() -> None: str1 = "test" - encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1)) - expected = [{"a_namespace": encoded_str1}] - assert base.embed(base.Embed(str1), MockEncoder(), "a_namespace") == expected - expected_embed_and_keep = [{"a_namespace": str1 + " " + encoded_str1}] - assert ( - base.embed(base.EmbedAndKeep(str1), MockEncoder(), "a_namespace") - == expected_embed_and_keep - ) + expected_dense = {"a_namespace": [4.0, 0.0]} + expected_sparse = {"a_namespace": {"raw": str1}} + + featurized = base.embed(base.Embed(str1), MockEncoder(), "a_namespace") + assert featurized.dense == expected_dense + assert featurized.sparse == {} + + featurized = base.embed(base.EmbedAndKeep(str1), MockEncoder(), "a_namespace") + assert featurized.sparse == expected_sparse + assert featurized.dense == expected_dense def test_simple_context_str_w_nested_emb() -> None: # nested embeddings, innermost wins str1 = "test" - encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1)) - expected = [{"a_namespace": encoded_str1}] - assert ( - base.embed(base.EmbedAndKeep(base.Embed(str1)), MockEncoder(), "a_namespace") - == expected - ) + expected_dense = {"a_namespace": [4.0, 0.0]} + expected_sparse = {"a_namespace": {"raw": str1}} - expected2 = [{"a_namespace": str1 + " " + encoded_str1}] - assert ( - base.embed(base.Embed(base.EmbedAndKeep(str1)), MockEncoder(), "a_namespace") - == expected2 - ) + featurized = base.embed(base.EmbedAndKeep(base.Embed(str1)), MockEncoder(), "a_namespace") + assert featurized.dense == expected_dense + assert featurized.sparse == {} + featurized = base.embed(base.Embed(base.EmbedAndKeep(str1)), MockEncoder(), "a_namespace") + assert featurized.sparse == expected_sparse + assert featurized.dense == expected_dense -def test_context_w_namespace_no_emb() -> None: - expected = [{"test_namespace": "test"}] - assert base.embed({"test_namespace": "test"}, MockEncoder()) == expected +def test_context_w_namespace_no_emb() -> None: + expected_sparse = {"test_namespace": {"raw": "test"}} + featurized = base.embed({"test_namespace": "test"}, MockEncoder()) + assert featurized.sparse == expected_sparse + assert featurized.dense == {} def test_context_w_namespace_w_emb() -> None: str1 = "test" - encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1)) - expected = [{"test_namespace": encoded_str1}] - assert base.embed({"test_namespace": base.Embed(str1)}, MockEncoder()) == expected - expected_embed_and_keep = [{"test_namespace": str1 + " " + encoded_str1}] - assert ( - base.embed({"test_namespace": base.EmbedAndKeep(str1)}, MockEncoder()) - == expected_embed_and_keep - ) + expected_sparse = {"test_namespace": {"raw": str1}} + expected_dense = {"test_namespace": [4.0, 0.0]} + + featurized = base.embed({"test_namespace": base.Embed(str1)}, MockEncoder()) + assert featurized.sparse == {} + assert featurized.dense == expected_dense + + featurized = base.embed({"test_namespace": base.EmbedAndKeep(str1)}, MockEncoder()) + assert featurized.sparse == expected_sparse + assert featurized.dense == expected_dense def test_context_w_namespace_w_emb2() -> None: str1 = "test" - encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1)) - expected = [{"test_namespace": encoded_str1}] - assert base.embed(base.Embed({"test_namespace": str1}), MockEncoder()) == expected - expected_embed_and_keep = [{"test_namespace": str1 + " " + encoded_str1}] - assert ( - base.embed(base.EmbedAndKeep({"test_namespace": str1}), MockEncoder()) - == expected_embed_and_keep - ) + expected_sparse = {"test_namespace": {"raw": str1}} + expected_dense = {"test_namespace": [4.0, 0.0]} + + featurized = base.embed(base.Embed({"test_namespace": str1}), MockEncoder()) + assert featurized.sparse == {} + assert featurized.dense == expected_dense + + featurized = base.embed(base.EmbedAndKeep({"test_namespace": str1}), MockEncoder()) + assert featurized.sparse == expected_sparse + assert featurized.dense == expected_dense def test_context_w_namespace_w_some_emb() -> None: - str1 = "test1" - str2 = "test2" - encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2)) - expected = [{"test_namespace": str1, "test_namespace2": encoded_str2}] - assert ( - base.embed( + str1 = "test" + str2 = "test_" + expected_sparse = {"test_namespace": {"raw": str1}} + expected_dense = {"test_namespace2": [5.0, 0.0]} + featurized = base.embed( {"test_namespace": str1, "test_namespace2": base.Embed(str2)}, MockEncoder() ) - == expected - ) - expected_embed_and_keep = [ - {"test_namespace": str1, "test_namespace2": str2 + " " + encoded_str2} - ] - assert ( - base.embed( + assert featurized.sparse == expected_sparse + assert featurized.dense == expected_dense + + expected_sparse = {"test_namespace": {"raw": str1}, "test_namespace2": {"raw": str2}} + featurized = base.embed( {"test_namespace": str1, "test_namespace2": base.EmbedAndKeep(str2)}, MockEncoder(), ) - == expected_embed_and_keep - ) + assert featurized.sparse == expected_sparse + assert featurized.dense == expected_dense def test_simple_action_strlist_no_emb() -> None: str1 = "test1" str2 = "test2" str3 = "test3" - expected = [{"a_namespace": str1}, {"a_namespace": str2}, {"a_namespace": str3}] + expected_sparse = [ + {"a_namespace": {"raw": str1}}, + {"a_namespace": {"raw": str2}}, + {"a_namespace": {"raw": str3}}] to_embed: List[Union[str, base._Embed]] = [str1, str2, str3] - assert base.embed(to_embed, MockEncoder(), "a_namespace") == expected + featurized = base.embed(to_embed, MockEncoder(), "a_namespace") + + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == {} def test_simple_action_strlist_w_emb() -> None: - str1 = "test1" - str2 = "test2" - str3 = "test3" - encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3)) - expected = [ - {"a_namespace": encoded_str1}, - {"a_namespace": encoded_str2}, - {"a_namespace": encoded_str3}, - ] - assert ( - base.embed(base.Embed([str1, str2, str3]), MockEncoder(), "a_namespace") - == expected - ) - expected_embed_and_keep = [ - {"a_namespace": str1 + " " + encoded_str1}, - {"a_namespace": str2 + " " + encoded_str2}, - {"a_namespace": str3 + " " + encoded_str3}, - ] - assert ( - base.embed(base.EmbedAndKeep([str1, str2, str3]), MockEncoder(), "a_namespace") - == expected_embed_and_keep - ) + str1 = "test" + str2 = "test_" + str3 = "test__" + + expected_sparse = [ + {"a_namespace": {"raw": str1}}, + {"a_namespace": {"raw": str2}}, + {"a_namespace": {"raw": str3}}] + expected_dense = [ + {"a_namespace": [4.0, 0.0]}, + {"a_namespace": [5.0, 0.0]}, + {"a_namespace": [6.0, 0.0]}] + + featurized = base.embed(base.Embed([str1, str2, str3]), MockEncoder(), "a_namespace") + for i in range(len(featurized)): + assert featurized[i].sparse == {} + assert featurized[i].dense == expected_dense[i] + + featurized = base.embed(base.EmbedAndKeep([str1, str2, str3]), MockEncoder(), "a_namespace") + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] def test_simple_action_strlist_w_some_emb() -> None: - str1 = "test1" - str2 = "test2" - str3 = "test3" - encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3)) - expected = [ - {"a_namespace": str1}, - {"a_namespace": encoded_str2}, - {"a_namespace": encoded_str3}, - ] - assert ( - base.embed( - [str1, base.Embed(str2), base.Embed(str3)], MockEncoder(), "a_namespace" - ) - == expected - ) - expected_embed_and_keep = [ - {"a_namespace": str1}, - {"a_namespace": str2 + " " + encoded_str2}, - {"a_namespace": str3 + " " + encoded_str3}, - ] - assert ( - base.embed( - [str1, base.EmbedAndKeep(str2), base.EmbedAndKeep(str3)], - MockEncoder(), - "a_namespace", - ) - == expected_embed_and_keep - ) + str1 = "test" + str2 = "test_" + str3 = "test__" + + expected_sparse = [ + {"a_namespace": {"raw": str1}}, + {}, + {}] + expected_dense = [ + {}, + {"a_namespace": [5.0, 0.0]}, + {"a_namespace": [6.0, 0.0]}] + featurized = base.embed([str1, base.Embed(str2), base.Embed(str3)], MockEncoder(), "a_namespace") + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] + + featurized = base.embed([str1, base.EmbedAndKeep(str2), base.EmbedAndKeep(str3)], MockEncoder(), "a_namespace") + expected_sparse = [ + {"a_namespace": {"raw": str1}}, + {"a_namespace": {"raw": str2}}, + {"a_namespace": {"raw": str3}}] + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] def test_action_w_namespace_no_emb() -> None: str1 = "test1" str2 = "test2" str3 = "test3" - expected = [ - {"test_namespace": str1}, - {"test_namespace": str2}, - {"test_namespace": str3}, + expected_sparse = [ + {"test_namespace": {"raw": str1}}, + {"test_namespace": {"raw": str2}}, + {"test_namespace": {"raw": str3}}, ] - assert ( - base.embed( + + featurized = base.embed( [ {"test_namespace": str1}, {"test_namespace": str2}, @@ -180,24 +184,26 @@ def test_action_w_namespace_no_emb() -> None: ], MockEncoder(), ) - == expected - ) + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == {} def test_action_w_namespace_w_emb() -> None: - str1 = "test1" - str2 = "test2" - str3 = "test3" - encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3)) - expected = [ - {"test_namespace": encoded_str1}, - {"test_namespace": encoded_str2}, - {"test_namespace": encoded_str3}, + str1 = "test" + str2 = "test_" + str3 = "test__" + expected_sparse = [ + {"test_namespace": {"raw": str1}}, + {"test_namespace": {"raw": str2}}, + {"test_namespace": {"raw": str3}}, ] - assert ( - base.embed( + expected_dense = [ + {"test_namespace": [4.0, 0.0]}, + {"test_namespace": [5.0, 0.0]}, + {"test_namespace": [6.0, 0.0]}] + + featurized = base.embed( [ {"test_namespace": base.Embed(str1)}, {"test_namespace": base.Embed(str2)}, @@ -205,15 +211,11 @@ def test_action_w_namespace_w_emb() -> None: ], MockEncoder(), ) - == expected - ) - expected_embed_and_keep = [ - {"test_namespace": str1 + " " + encoded_str1}, - {"test_namespace": str2 + " " + encoded_str2}, - {"test_namespace": str3 + " " + encoded_str3}, - ] - assert ( - base.embed( + for i in range(len(featurized)): + assert featurized[i].sparse == {} + assert featurized[i].dense == expected_dense[i] + + featurized = base.embed( [ {"test_namespace": base.EmbedAndKeep(str1)}, {"test_namespace": base.EmbedAndKeep(str2)}, @@ -221,42 +223,41 @@ def test_action_w_namespace_w_emb() -> None: ], MockEncoder(), ) - == expected_embed_and_keep - ) + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] + def test_action_w_namespace_w_emb2() -> None: - str1 = "test1" - str2 = "test2" - str3 = "test3" - encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3)) - expected = [ - {"test_namespace1": encoded_str1}, - {"test_namespace2": encoded_str2}, - {"test_namespace3": encoded_str3}, - ] - assert ( - base.embed( - base.Embed( - [ - {"test_namespace1": str1}, - {"test_namespace2": str2}, - {"test_namespace3": str3}, - ] - ), - MockEncoder(), - ) - == expected - ) - expected_embed_and_keep = [ - {"test_namespace1": str1 + " " + encoded_str1}, - {"test_namespace2": str2 + " " + encoded_str2}, - {"test_namespace3": str3 + " " + encoded_str3}, + str1 = "test" + str2 = "test_" + str3 = "test__" + expected_sparse = [ + {"test_namespace1": {"raw": str1}}, + {"test_namespace2": {"raw": str2}}, + {"test_namespace3": {"raw": str3}}, ] - assert ( - base.embed( + expected_dense = [ + {"test_namespace1": [4.0, 0.0]}, + {"test_namespace2": [5.0, 0.0]}, + {"test_namespace3": [6.0, 0.0]}] + + featurized = base.embed( + base.Embed( + [ + {"test_namespace1": str1}, + {"test_namespace2": str2}, + {"test_namespace3": str3}, + ] + ), + MockEncoder(), + ) + for i in range(len(featurized)): + assert featurized[i].sparse == {} + assert featurized[i].dense == expected_dense[i] + + featurized = base.embed( base.EmbedAndKeep( [ {"test_namespace1": str1}, @@ -266,23 +267,26 @@ def test_action_w_namespace_w_emb2() -> None: ), MockEncoder(), ) - == expected_embed_and_keep - ) + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] def test_action_w_namespace_w_some_emb() -> None: - str1 = "test1" - str2 = "test2" - str3 = "test3" - encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3)) - expected = [ - {"test_namespace": str1}, - {"test_namespace": encoded_str2}, - {"test_namespace": encoded_str3}, + str1 = "test" + str2 = "test_" + str3 = "test__" + expected_sparse = [ + {"test_namespace": {"raw": str1}}, + {}, + {}, ] - assert ( - base.embed( + expected_dense = [ + {}, + {"test_namespace": [5.0, 0.0]}, + {"test_namespace": [6.0, 0.0]}] + + featurized = base.embed( [ {"test_namespace": str1}, {"test_namespace": base.Embed(str2)}, @@ -290,15 +294,16 @@ def test_action_w_namespace_w_some_emb() -> None: ], MockEncoder(), ) - == expected - ) - expected_embed_and_keep = [ - {"test_namespace": str1}, - {"test_namespace": str2 + " " + encoded_str2}, - {"test_namespace": str3 + " " + encoded_str3}, + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] + + expected_sparse = [ + {"test_namespace": {"raw": str1}}, + {"test_namespace": {"raw": str2}}, + {"test_namespace": {"raw": str3}}, ] - assert ( - base.embed( + featurized = base.embed( [ {"test_namespace": str1}, {"test_namespace": base.EmbedAndKeep(str2)}, @@ -306,24 +311,26 @@ def test_action_w_namespace_w_some_emb() -> None: ], MockEncoder(), ) - == expected_embed_and_keep - ) + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None: - str1 = "test1" - str2 = "test2" - str3 = "test3" - encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1)) - encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2)) - encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3)) - expected = [ - {"test_namespace": encoded_str1, "test_namespace2": str1}, - {"test_namespace": encoded_str2, "test_namespace2": str2}, - {"test_namespace": encoded_str3, "test_namespace2": str3}, + str1 = "test" + str2 = "test_" + str3 = "test__" + expected_sparse = [ + {"test_namespace2": {"raw": str1}}, + {"test_namespace2": {"raw": str2}}, + {"test_namespace2": {"raw": str3}}, ] - assert ( - base.embed( + expected_dense = [ + {"test_namespace": [4.0, 0.0]}, + {"test_namespace": [5.0, 0.0]}, + {"test_namespace": [6.0, 0.0]}] + + featurized = base.embed( [ {"test_namespace": base.Embed(str1), "test_namespace2": str1}, {"test_namespace": base.Embed(str2), "test_namespace2": str2}, @@ -331,15 +338,16 @@ def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None: ], MockEncoder(), ) - == expected - ) - expected_embed_and_keep = [ - {"test_namespace": str1 + " " + encoded_str1, "test_namespace2": str1}, - {"test_namespace": str2 + " " + encoded_str2, "test_namespace2": str2}, - {"test_namespace": str3 + " " + encoded_str3, "test_namespace2": str3}, + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] + + expected_sparse = [ + {"test_namespace": {"raw": str1}, "test_namespace2": {"raw": str1}}, + {"test_namespace": {"raw": str2}, "test_namespace2": {"raw": str2}}, + {"test_namespace": {"raw": str3}, "test_namespace2": {"raw": str3}}, ] - assert ( - base.embed( + featurized = base.embed( [ {"test_namespace": base.EmbedAndKeep(str1), "test_namespace2": str1}, {"test_namespace": base.EmbedAndKeep(str2), "test_namespace2": str2}, @@ -347,26 +355,32 @@ def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None: ], MockEncoder(), ) - == expected_embed_and_keep - ) + for i in range(len(featurized)): + assert featurized[i].sparse == expected_sparse[i] + assert featurized[i].dense == expected_dense[i] def test_one_namespace_w_list_of_features_no_emb() -> None: str1 = "test1" str2 = "test2" - expected = [{"test_namespace": [str1, str2]}] - assert base.embed({"test_namespace": [str1, str2]}, MockEncoder()) == expected + expected_sparse = { + "test_namespace_0": {"raw": str1}, + "test_namespace_1": {"raw": str2}} + + featurized = base.embed({"test_namespace": [str1, str2]}, MockEncoder()) + assert featurized.sparse == expected_sparse + assert featurized.dense == {} def test_one_namespace_w_list_of_features_w_some_emb() -> None: - str1 = "test1" - str2 = "test2" - encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2)) - expected = [{"test_namespace": [str1, encoded_str2]}] - assert ( - base.embed({"test_namespace": [str1, base.Embed(str2)]}, MockEncoder()) - == expected - ) + str1 = "test" + str2 = "test_" + expected_sparse = {"test_namespace_0": {"raw": str1}} + expected_dense = {"test_namespace_1": [5.0, 0.0]} + + featurized = base.embed({"test_namespace": [str1, base.Embed(str2)]}, MockEncoder()) + assert featurized.sparse == expected_sparse + assert featurized.dense == expected_dense def test_nested_list_features_throws() -> None: diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py index e52d1da..59a64e9 100644 --- a/tests/unit_tests/test_utils.py +++ b/tests/unit_tests/test_utils.py @@ -3,7 +3,7 @@ class MockEncoder: def encode(self, to_encode: str) -> str: - return "[encoded]" + to_encode + return [float(len(to_encode)), 0.0] class MockEncoderReturnsList: From 966590dfae90c01f8f9ab8041e63c0d2321e5677 Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Tue, 14 Nov 2023 23:23:48 -0500 Subject: [PATCH 5/9] no vw logic in pick_best_featurizer --- src/learn_to_pick/base.py | 15 ++++--- src/learn_to_pick/pick_best.py | 39 +++++++++++-------- tests/unit_tests/test_pick_best_call.py | 9 +++-- .../test_pick_best_text_embedder.py | 35 +++++++++-------- 4 files changed, 55 insertions(+), 43 deletions(-) diff --git a/src/learn_to_pick/base.py b/src/learn_to_pick/base.py index c5dbf80..d3a57d6 100644 --- a/src/learn_to_pick/base.py +++ b/src/learn_to_pick/base.py @@ -13,13 +13,13 @@ Type, TypeVar, Union, + Callable ) from learn_to_pick.metrics import MetricsTrackerAverage, MetricsTrackerRollingWindow from learn_to_pick.model_repository import ModelRepository from learn_to_pick.vw_logger import VwLogger from learn_to_pick.features import Featurized, DenseFeatures, SparseFeatures -import numpy as np if TYPE_CHECKING: import vowpal_wabbit_next as vw @@ -163,6 +163,7 @@ def __init__( model_repo: ModelRepository, vw_cmd: List[str], featurizer: Featurizer, + formatter: Callable, vw_logger: VwLogger, *args: Any, **kwargs: Any, @@ -172,27 +173,31 @@ def __init__( self.vw_cmd = vw_cmd self.workspace = self.model_repo.load(vw_cmd) self.featurizer = featurizer + self.formatter = formatter self.vw_logger = vw_logger + def format(self, event): + return self.formatter(*self.featurizer.featurize(event)) + def predict(self, event: TEvent) -> Any: import vowpal_wabbit_next as vw text_parser = vw.TextFormatParser(self.workspace) return self.workspace.predict_one( - _parse_lines(text_parser, self.featurizer.format(event)) + _parse_lines(text_parser, self.format(event)) ) def learn(self, event: TEvent) -> None: import vowpal_wabbit_next as vw - vw_ex = self.featurizer.format(event) + vw_ex = self.format(event) text_parser = vw.TextFormatParser(self.workspace) multi_ex = _parse_lines(text_parser, vw_ex) self.workspace.learn_one(multi_ex) def log(self, event: TEvent) -> None: if self.vw_logger.logging_enabled(): - vw_ex = self.featurizer.format(event) + vw_ex = self.format(event) self.vw_logger.log(vw_ex) def save(self) -> None: @@ -204,7 +209,7 @@ def __init__(self, *args: Any, **kwargs: Any): pass @abstractmethod - def format(self, event: TEvent) -> Any: + def featurize(self, event: TEvent) -> Any: ... diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py index ebd682e..963487a 100644 --- a/src/learn_to_pick/pick_best.py +++ b/src/learn_to_pick/pick_best.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Any, Dict, List, Optional, Tuple, Type, Union, Iterable +from typing import Any, Dict, List, Optional, Tuple, Type, Union, Callable from itertools import chain import os import numpy as np @@ -82,7 +82,7 @@ def _to_str(v): def featurized_2_str(obj: base.Featurized) -> str: return " ".join(chain.from_iterable([ map(lambda kv: f'|{kv[0]}_dense {VwTxt._dense_2_str(kv[1])}', obj.dense.items()), - map(lambda kv: f'|{kv[0]}_sparse {VwTxt._sparse_2_str(kv[1])}', obj.sparse.items())])) + map(lambda kv: f'|{kv[0]}_sparse {VwTxt._sparse_2_str(kv[1])}', obj.sparse.items())])) class PickBestFeaturizer(base.Featurizer[PickBestEvent]): @@ -137,22 +137,25 @@ def _generic_namespaces(self, context, actions): for a in actions: a['#'] = self._generic_namespace(a) - def format(self, event: PickBestEvent) -> str: - context_emb = event.context(self.model) - action_embs = event.actions(self.model) + def featurize(self, event: PickBestEvent) -> Tuple[base.Featurized, List[base.Featurized], PickBestSelected]: + context = event.context(self.model) + actions = event.actions(self.model) if self.auto_embed: - self._dotproducts(context_emb, action_embs) - self._generic_namespaces(context_emb, action_embs) + self._dotproducts(context, actions) + self._generic_namespaces(context, actions) - nactions = len(action_embs) - context_str = f"shared {VwTxt.featurized_2_str(context_emb)}" - selected = event.selected - labels = ["" for _ in range(nactions)] - if selected.score is not None: - labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " - actions_str = [f"{l}{VwTxt.featurized_2_str(a)}" for a, l in zip(action_embs, labels)] - return "\n".join([context_str] + actions_str) + return context, actions, event.selected + + +def vw_cb_formatter(context: base.Featurized, actions: List[base.Featurized], selected: PickBestSelected) -> str: + nactions = len(actions) + context_str = f"shared {VwTxt.featurized_2_str(context)}" + labels = ["" for _ in range(nactions)] + if selected.score is not None: + labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " + actions_str = [f"{l}{VwTxt.featurized_2_str(a)}" for a, l in zip(actions, labels)] + return "\n".join([context_str] + actions_str) class PickBestRandomPolicy(base.Policy[PickBestEvent]): @@ -300,13 +303,14 @@ def create( @staticmethod def create_policy( featurizer: Optional[base.Featurizer] = None, + formatter: Optional[Callable] = None, vw_cmd: Optional[List[str]] = None, model_save_dir: str = "./", reset_model: bool = False, rl_logs: Optional[Union[str, os.PathLike]] = None, ): - if not featurizer: - featurizer = PickBestFeaturizer(auto_embed=False) + featurizer = featurizer or PickBestFeaturizer(auto_embed=False) + formatter = formatter or vw_cb_formatter vw_cmd = vw_cmd or [] interactions = [] @@ -334,6 +338,7 @@ def create_policy( ), vw_cmd=vw_cmd, featurizer=featurizer, + formatter=formatter, vw_logger=base.VwLogger(rl_logs), ) diff --git a/tests/unit_tests/test_pick_best_call.py b/tests/unit_tests/test_pick_best_call.py index 43891bd..a9056e9 100644 --- a/tests/unit_tests/test_pick_best_call.py +++ b/tests/unit_tests/test_pick_best_call.py @@ -5,6 +5,7 @@ import learn_to_pick import learn_to_pick.base as rl_loop +from learn_to_pick.pick_best import vw_cb_formatter encoded_keyword = "[encoded]" @@ -179,7 +180,7 @@ def test_everything_embedded() -> None: action=rl_loop.EmbedAndKeep(learn_to_pick.ToSelectFrom(actions)), ) picked_metadata = response["picked_metadata"] # type: ignore - vw_str = featurizer.format(picked_metadata) # type: ignore + vw_str = vw_cb_formatter(*featurizer.featurize(picked_metadata)) # type: ignore assert_vw_ex_equals(vw_str, expected) @@ -205,7 +206,7 @@ def test_default_auto_embedder_is_off() -> None: action=learn_to_pick.base.ToSelectFrom(actions), ) picked_metadata = response["picked_metadata"] # type: ignore - vw_str = featurizer.format(picked_metadata) # type: ignore + vw_str = vw_cb_formatter(*featurizer.featurize(picked_metadata)) # type: ignore assert_vw_ex_equals(vw_str, expected) @@ -231,7 +232,7 @@ def test_default_w_embeddings_off() -> None: action=learn_to_pick.ToSelectFrom(actions), ) picked_metadata = response["picked_metadata"] # type: ignore - vw_str = featurizer.format(picked_metadata) # type: ignore + vw_str = vw_cb_formatter(*featurizer.featurize(picked_metadata)) # type: ignore assert_vw_ex_equals(vw_str, expected) @@ -258,7 +259,7 @@ def test_default_w_embeddings_on() -> None: action=learn_to_pick.ToSelectFrom(actions), ) picked_metadata = response["picked_metadata"] # type: ignore - vw_str = featurizer.format(picked_metadata) # type: ignore + vw_str = vw_cb_formatter(*featurizer.featurize(picked_metadata)) # type: ignore assert_vw_ex_equals(vw_str, expected) diff --git a/tests/unit_tests/test_pick_best_text_embedder.py b/tests/unit_tests/test_pick_best_text_embedder.py index 0d8b47d..feca4e8 100644 --- a/tests/unit_tests/test_pick_best_text_embedder.py +++ b/tests/unit_tests/test_pick_best_text_embedder.py @@ -3,6 +3,7 @@ import learn_to_pick.base as rl_chain import learn_to_pick.pick_best as pick_best_chain +from learn_to_pick.pick_best import vw_cb_formatter def test_pickbest_textembedder_missing_context_not_throws() -> None: @@ -13,7 +14,7 @@ def test_pickbest_textembedder_missing_context_not_throws() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_action, based_on={} ) - featurizer.format(event) + featurizer.featurize(event) def test_pickbest_textembedder_missing_actions_throws() -> None: @@ -24,7 +25,7 @@ def test_pickbest_textembedder_missing_actions_throws() -> None: inputs={}, to_select_from={}, based_on={"context": "context"} ) with pytest.raises(ValueError): - featurizer.format(event) + featurizer.featurize(event) def test_pickbest_textembedder_no_label_no_emb() -> None: @@ -41,7 +42,7 @@ def test_pickbest_textembedder_no_label_no_emb() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on={"context": "context"} ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -62,7 +63,7 @@ def test_pickbest_textembedder_w_label_no_score_no_emb() -> None: based_on={"context": "context"}, selected=selected, ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -84,7 +85,7 @@ def test_pickbest_textembedder_w_full_label_no_emb() -> None: based_on={"context": "context"}, selected=selected, ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -110,7 +111,7 @@ def test_pickbest_textembedder_w_full_label_w_emb() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -136,7 +137,7 @@ def test_pickbest_textembedder_w_full_label_w_embed_and_keep() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -154,7 +155,7 @@ def test_pickbest_textembedder_more_namespaces_no_label_no_emb() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -173,7 +174,7 @@ def test_pickbest_textembedder_more_namespaces_w_label_no_emb() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -192,7 +193,7 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_no_emb() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -225,7 +226,7 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_emb() -> None event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -262,7 +263,7 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_embed_and_kee event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -294,7 +295,7 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emb() -> N event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -328,7 +329,7 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emakeep() event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected) @@ -354,7 +355,7 @@ def test_raw_features_underscored() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected_no_embed) # Just embeddings @@ -366,7 +367,7 @@ def test_raw_features_underscored() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected_embed) # Embeddings and raw features @@ -378,5 +379,5 @@ def test_raw_features_underscored() -> None: event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) - vw_ex_str = featurizer.format(event) + vw_ex_str = vw_cb_formatter(*featurizer.featurize(event)) assert_vw_ex_equals(vw_ex_str, expected_embed_and_keep) From 36fd0aafffee3034b3bc8b06df2e63c690dd4a47 Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Tue, 14 Nov 2023 23:30:54 -0500 Subject: [PATCH 6/9] black --- src/learn_to_pick/base.py | 18 +- src/learn_to_pick/features.py | 13 +- src/learn_to_pick/pick_best.py | 66 +++-- tests/unit_tests/test_pick_best_call.py | 52 ++-- .../test_pick_best_text_embedder.py | 186 ++++++++------ .../unit_tests/test_rl_loop_base_embedder.py | 226 ++++++++++-------- 6 files changed, 334 insertions(+), 227 deletions(-) diff --git a/src/learn_to_pick/base.py b/src/learn_to_pick/base.py index d3a57d6..dab5e45 100644 --- a/src/learn_to_pick/base.py +++ b/src/learn_to_pick/base.py @@ -13,7 +13,7 @@ Type, TypeVar, Union, - Callable + Callable, ) from learn_to_pick.metrics import MetricsTrackerAverage, MetricsTrackerRollingWindow @@ -183,9 +183,7 @@ def predict(self, event: TEvent) -> Any: import vowpal_wabbit_next as vw text_parser = vw.TextFormatParser(self.workspace) - return self.workspace.predict_one( - _parse_lines(text_parser, self.format(event)) - ) + return self.workspace.predict_one(_parse_lines(text_parser, self.format(event))) def learn(self, event: TEvent) -> None: import vowpal_wabbit_next as vw @@ -489,18 +487,20 @@ def run(self, *args, **kwargs) -> Dict[str, Any]: def _embed_string_type( - item: Union[str, _Embed], model: Any, namespace: str) -> Featurized: + item: Union[str, _Embed], model: Any, namespace: str +) -> Featurized: """Helper function to embed a string or an _Embed object.""" import re + result = Featurized() if isinstance(item, _Embed): result[namespace] = DenseFeatures(model.encode(item.value)) if item.keep: keep_str = item.value.replace(" ", "_") - result[namespace] = {'raw': re.sub(r"[\t\n\r\f\v]+", " ", keep_str)} + result[namespace] = {"raw": re.sub(r"[\t\n\r\f\v]+", " ", keep_str)} elif isinstance(item, str): encoded = item.replace(" ", "_") - result[namespace] = {'raw': re.sub(r"[\t\n\r\f\v]+", " ", encoded)} + result[namespace] = {"raw": re.sub(r"[\t\n\r\f\v]+", " ", encoded)} else: raise ValueError(f"Unsupported type {type(item)} for embedding") @@ -513,7 +513,7 @@ def _embed_dict_type(item: Dict, model: Any) -> Featurized: for ns, embed_item in item.items(): if isinstance(embed_item, list): for idx, embed_list_item in enumerate(embed_item): - result.merge(_embed_string_type(embed_list_item, model, f'{ns}_{idx}')) + result.merge(_embed_string_type(embed_list_item, model, f"{ns}_{idx}")) else: result.merge(_embed_string_type(embed_item, model, ns)) return result @@ -529,7 +529,7 @@ def _embed_list_type( elif isinstance(embed_item, list): result.append(Featurized()) for idx, embed_list_item in enumerate(embed_item): - result[-1].merge(_embed_string_type(embed_list_item, model, f'{idx}')) + result[-1].merge(_embed_string_type(embed_list_item, model, f"{idx}")) else: result.append(_embed_string_type(embed_item, model, namespace)) return result diff --git a/src/learn_to_pick/features.py b/src/learn_to_pick/features.py index b03c808..d5ded1c 100644 --- a/src/learn_to_pick/features.py +++ b/src/learn_to_pick/features.py @@ -1,6 +1,7 @@ from typing import Union, Optional, Dict, List import numpy as np + class SparseFeatures(dict): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -12,7 +13,11 @@ def __init__(self, *args, **kwargs): class Featurized: - def __init__(self, sparse: Optional[Dict[str, SparseFeatures]] = None, dense: Optional[Dict[str, DenseFeatures]] = None): + def __init__( + self, + sparse: Optional[Dict[str, SparseFeatures]] = None, + dense: Optional[Dict[str, DenseFeatures]] = None, + ): self.sparse = sparse or {} self.dense = dense or {} @@ -22,8 +27,10 @@ def __setitem__(self, key, value): elif isinstance(value, List) or isinstance(value, np.ndarray): self.dense[key] = DenseFeatures(value) else: - raise ValueError(f'Cannot convert {type(value)} to either DenseFeatures or SparseFeatures') - + raise ValueError( + f"Cannot convert {type(value)} to either DenseFeatures or SparseFeatures" + ) + def merge(self, other): self.sparse.update(other.sparse) self.dense.update(other.dense) diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py index 963487a..df8c6ac 100644 --- a/src/learn_to_pick/pick_best.py +++ b/src/learn_to_pick/pick_best.py @@ -30,6 +30,7 @@ def __init__( self.probability = probability self.score = score + class PickBestEvent(base.Event[PickBestSelected]): def __init__( self, @@ -65,6 +66,7 @@ def actions(self, model) -> List[base.Featurized]: ) return action_embs + class VwTxt: @staticmethod def _dense_2_str(values: base.DenseFeatures) -> str: @@ -74,15 +76,27 @@ def _dense_2_str(values: base.DenseFeatures) -> str: def _sparse_2_str(values: base.SparseFeatures) -> str: def _to_str(v): import numbers - return v if isinstance(v, numbers.Number) else f'={v}' + + return v if isinstance(v, numbers.Number) else f"={v}" return " ".join([f"{k}:{_to_str(v)}" for k, v in values.items()]) - + @staticmethod def featurized_2_str(obj: base.Featurized) -> str: - return " ".join(chain.from_iterable([ - map(lambda kv: f'|{kv[0]}_dense {VwTxt._dense_2_str(kv[1])}', obj.dense.items()), - map(lambda kv: f'|{kv[0]}_sparse {VwTxt._sparse_2_str(kv[1])}', obj.sparse.items())])) + return " ".join( + chain.from_iterable( + [ + map( + lambda kv: f"|{kv[0]}_dense {VwTxt._dense_2_str(kv[1])}", + obj.dense.items(), + ), + map( + lambda kv: f"|{kv[0]}_sparse {VwTxt._sparse_2_str(kv[1])}", + obj.sparse.items(), + ), + ] + ) + ) class PickBestFeaturizer(base.Featurizer[PickBestEvent]): @@ -109,54 +123,64 @@ def __init__( def _dotproducts(self, context, actions): _context_dense = base.Featurized() for ns in context.sparse.keys(): - if 'raw' in context.sparse[ns]: - _context_dense[ns] = self.model.encode(context.sparse[ns]['raw']) + if "raw" in context.sparse[ns]: + _context_dense[ns] = self.model.encode(context.sparse[ns]["raw"]) _actions_dense = [base.Featurized() for _ in range(len(actions))] for _action, action in zip(_actions_dense, actions): for ns in action.sparse.keys(): - if 'raw' in action.sparse[ns]: - _action[ns] = self.model.encode(action.sparse[ns]['raw']) + if "raw" in action.sparse[ns]: + _action[ns] = self.model.encode(action.sparse[ns]["raw"]) context_names = list(_context_dense.dense.keys()) context_matrix = np.stack(list(_context_dense.dense.values())) for _a, a in zip(_actions_dense, actions): action_names = list(_a.dense.keys()) product = np.dot(context_matrix, np.stack(list(_a.dense.values())).T) - a['dotprod'] = {f'{context_names[i]}_{action_names[j]}': product[i, j] for i in range(len(context_names)) for j in range(len(action_names))} + a["dotprod"] = { + f"{context_names[i]}_{action_names[j]}": product[i, j] + for i in range(len(context_names)) + for j in range(len(action_names)) + } def _generic_namespace(self, featurized): result = base.SparseFeatures() for ns in featurized.sparse.keys(): - if 'raw' in featurized.sparse[ns]: - result[ns] = featurized.sparse[ns]['raw'] + if "raw" in featurized.sparse[ns]: + result[ns] = featurized.sparse[ns]["raw"] return result def _generic_namespaces(self, context, actions): - context['@'] = self._generic_namespace(context) + context["@"] = self._generic_namespace(context) for a in actions: - a['#'] = self._generic_namespace(a) + a["#"] = self._generic_namespace(a) - def featurize(self, event: PickBestEvent) -> Tuple[base.Featurized, List[base.Featurized], PickBestSelected]: + def featurize( + self, event: PickBestEvent + ) -> Tuple[base.Featurized, List[base.Featurized], PickBestSelected]: context = event.context(self.model) actions = event.actions(self.model) if self.auto_embed: self._dotproducts(context, actions) self._generic_namespaces(context, actions) - + return context, actions, event.selected -def vw_cb_formatter(context: base.Featurized, actions: List[base.Featurized], selected: PickBestSelected) -> str: +def vw_cb_formatter( + context: base.Featurized, actions: List[base.Featurized], selected: PickBestSelected +) -> str: nactions = len(actions) context_str = f"shared {VwTxt.featurized_2_str(context)}" labels = ["" for _ in range(nactions)] if selected.score is not None: - labels[selected.index] = f"{selected.index}:{-selected.score}:{selected.probability} " + labels[ + selected.index + ] = f"{selected.index}:{-selected.score}:{selected.probability} " actions_str = [f"{l}{VwTxt.featurized_2_str(a)}" for a, l in zip(actions, labels)] return "\n".join([context_str] + actions_str) - + class PickBestRandomPolicy(base.Policy[PickBestEvent]): def __init__(self): @@ -235,7 +259,9 @@ def _call_after_predict_before_scoring( sampled_ap = prediction[sampled_index] sampled_action = sampled_ap[0] sampled_prob = sampled_ap[1] - event.selected = PickBestSelected(index=sampled_action, probability=sampled_prob) + event.selected = PickBestSelected( + index=sampled_action, probability=sampled_prob + ) next_inputs = inputs.copy() diff --git a/tests/unit_tests/test_pick_best_call.py b/tests/unit_tests/test_pick_best_call.py index a9056e9..24b374b 100644 --- a/tests/unit_tests/test_pick_best_call.py +++ b/tests/unit_tests/test_pick_best_call.py @@ -163,15 +163,18 @@ def test_everything_embedded() -> None: str2 = "1" str3 = "2" action_dense = "0:1.0 1:0.0" - + ctx_str_1 = "context1" encoded_ctx_str_1 = "0:8.0 1:0.0" - expected = "\n".join([ - f"shared |User_dense {encoded_ctx_str_1} |User_sparse raw:={ctx_str_1}", - f"|action_dense {action_dense} |action_sparse raw:={str1}", - f"|action_dense {action_dense} |action_sparse raw:={str2}", - f"|action_dense {action_dense} |action_sparse raw:={str3}"]) # noqa + expected = "\n".join( + [ + f"shared |User_dense {encoded_ctx_str_1} |User_sparse raw:={ctx_str_1}", + f"|action_dense {action_dense} |action_sparse raw:={str1}", + f"|action_dense {action_dense} |action_sparse raw:={str2}", + f"|action_dense {action_dense} |action_sparse raw:={str3}", + ] + ) # noqa actions = [str1, str2, str3] @@ -193,11 +196,14 @@ def test_default_auto_embedder_is_off() -> None: str3 = "2" ctx_str_1 = "context1" - expected = "\n".join([ - f"shared |User_sparse raw:={ctx_str_1}", - f"|action_sparse raw:={str1}", - f"|action_sparse raw:={str2}", - f"|action_sparse raw:={str3}"]) # noqa + expected = "\n".join( + [ + f"shared |User_sparse raw:={ctx_str_1}", + f"|action_sparse raw:={str1}", + f"|action_sparse raw:={str2}", + f"|action_sparse raw:={str3}", + ] + ) # noqa actions = [str1, str2, str3] @@ -219,11 +225,14 @@ def test_default_w_embeddings_off() -> None: str3 = "2" ctx_str_1 = "context1" - expected = "\n".join([ - f"shared |User_sparse raw:={ctx_str_1}", - f"|action_sparse raw:={str1}", - f"|action_sparse raw:={str2}", - f"|action_sparse raw:={str3}"]) # noqa + expected = "\n".join( + [ + f"shared |User_sparse raw:={ctx_str_1}", + f"|action_sparse raw:={str1}", + f"|action_sparse raw:={str2}", + f"|action_sparse raw:={str3}", + ] + ) # noqa actions = [str1, str2, str3] @@ -247,10 +256,13 @@ def test_default_w_embeddings_on() -> None: ctx_str_1 = "context1" dot_prod = "dotprod_sparse User_action:5.0" # dot prod of [1.0, 2.0] and [1.0, 2.0] - expected = "\n".join([ - f"shared |User_sparse raw:={ctx_str_1} |@_sparse User:={ctx_str_1}", - f"|action_sparse raw:={str1} |{dot_prod} |#_sparse action:={str1} ", - f"|action_sparse raw:={str2} |{dot_prod} |#_sparse action:={str2} "]) # noqa + expected = "\n".join( + [ + f"shared |User_sparse raw:={ctx_str_1} |@_sparse User:={ctx_str_1}", + f"|action_sparse raw:={str1} |{dot_prod} |#_sparse action:={str1} ", + f"|action_sparse raw:={str2} |{dot_prod} |#_sparse action:={str2} ", + ] + ) # noqa actions = [str1, str2] diff --git a/tests/unit_tests/test_pick_best_text_embedder.py b/tests/unit_tests/test_pick_best_text_embedder.py index feca4e8..414341a 100644 --- a/tests/unit_tests/test_pick_best_text_embedder.py +++ b/tests/unit_tests/test_pick_best_text_embedder.py @@ -33,12 +33,15 @@ def test_pickbest_textembedder_no_label_no_emb() -> None: auto_embed=False, model=MockEncoder() ) named_actions = {"action": ["0", "1", "2"]} - expected = "\n".join([ - "shared |context_sparse raw:=context", - "|action_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2"]) - + expected = "\n".join( + [ + "shared |context_sparse raw:=context", + "|action_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2", + ] + ) + event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on={"context": "context"} ) @@ -51,11 +54,14 @@ def test_pickbest_textembedder_w_label_no_score_no_emb() -> None: auto_embed=False, model=MockEncoder() ) named_actions = {"action": ["0", "1", "2"]} - expected = "\n".join([ - "shared |context_sparse raw:=context", - "|action_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2"]) + expected = "\n".join( + [ + "shared |context_sparse raw:=context", + "|action_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2", + ] + ) selected = pick_best_chain.PickBestSelected(index=0, probability=1.0) event = pick_best_chain.PickBestEvent( inputs={}, @@ -72,11 +78,14 @@ def test_pickbest_textembedder_w_full_label_no_emb() -> None: auto_embed=False, model=MockEncoder() ) named_actions = {"action": ["0", "1", "2"]} - expected = "\n".join([ - "shared |context_sparse raw:=context", - "0:-0.0:1.0 |action_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2"]) + expected = "\n".join( + [ + "shared |context_sparse raw:=context", + "0:-0.0:1.0 |action_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2", + ] + ) selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( @@ -102,11 +111,14 @@ def test_pickbest_textembedder_w_full_label_w_emb() -> None: named_actions = {"action": rl_chain.Embed([str1, str2, str3])} context = {"context": rl_chain.Embed(ctx_str)} - expected = "\n".join([ - f"shared |context_dense {encoded_ctx_str}", - "0:-0.0:1.0 |action_dense 0:1.0 1:0.0", - "|action_dense 0:1.0 1:0.0", - "|action_dense 0:1.0 1:0.0"]) # noqa: E501 + expected = "\n".join( + [ + f"shared |context_dense {encoded_ctx_str}", + "0:-0.0:1.0 |action_dense 0:1.0 1:0.0", + "|action_dense 0:1.0 1:0.0", + "|action_dense 0:1.0 1:0.0", + ] + ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -128,11 +140,14 @@ def test_pickbest_textembedder_w_full_label_w_embed_and_keep() -> None: named_actions = {"action": rl_chain.EmbedAndKeep([str1, str2, str3])} context = {"context": rl_chain.EmbedAndKeep(ctx_str)} - expected = "\n".join([ - f"shared |context_dense {encoded_ctx_str} |context_sparse raw:={ctx_str}", - "0:-0.0:1.0 |action_dense 0:1.0 1:0.0 |action_sparse raw:=0", - "|action_dense 0:1.0 1:0.0 |action_sparse raw:=1", - "|action_dense 0:1.0 1:0.0 |action_sparse raw:=2"]) # noqa: E501 + expected = "\n".join( + [ + f"shared |context_dense {encoded_ctx_str} |context_sparse raw:={ctx_str}", + "0:-0.0:1.0 |action_dense 0:1.0 1:0.0 |action_sparse raw:=0", + "|action_dense 0:1.0 1:0.0 |action_sparse raw:=1", + "|action_dense 0:1.0 1:0.0 |action_sparse raw:=2", + ] + ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -147,11 +162,14 @@ def test_pickbest_textembedder_more_namespaces_no_label_no_emb() -> None: ) named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]} context = {"context1": "context1", "context2": "context2"} - expected = "\n".join([ - "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2 ", - "|a_sparse raw:=0 |b_sparse raw:=0", - "|action1_sparse raw:=1", - "|action1_sparse raw:=2"]) # noqa: E501 + expected = "\n".join( + [ + "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2 ", + "|a_sparse raw:=0 |b_sparse raw:=0", + "|action1_sparse raw:=1", + "|action1_sparse raw:=2", + ] + ) # noqa: E501 event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) @@ -165,11 +183,14 @@ def test_pickbest_textembedder_more_namespaces_w_label_no_emb() -> None: ) named_actions = {"action": [{"a": "0", "b": "0"}, "1", "2"]} context = {"context1": "context1", "context2": "context2"} - expected = "\n".join([ - "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2", - "|a_sparse raw:=0 |b_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2"]) # noqa: E501 + expected = "\n".join( + [ + "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2", + "|a_sparse raw:=0 |b_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2", + ] + ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -184,11 +205,14 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_no_emb() -> None: ) named_actions = {"action": [{"a": "0", "b": "0"}, "1", "2"]} context = {"context1": "context1", "context2": "context2"} - expected = "\n".join([ - "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2", - "0:-0.0:1.0 |a_sparse raw:=0 |b_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2"]) # noqa: E501 + expected = "\n".join( + [ + "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2", + "0:-0.0:1.0 |a_sparse raw:=0 |b_sparse raw:=0", + "|action_sparse raw:=1", + "|action_sparse raw:=2", + ] + ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -216,11 +240,14 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_emb() -> None "context1": rl_chain.Embed(ctx_str_1), "context2": rl_chain.Embed(ctx_str_2), } - expected = "\n".join([ - f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2}", - f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0", - f"|action_dense 0:1.0 1:0.0", - f"|action_dense 0:1.0 1:0.0"]) # noqa: E501 + expected = "\n".join( + [ + f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2}", + f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0", + f"|action_dense 0:1.0 1:0.0", + f"|action_dense 0:1.0 1:0.0", + ] + ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( @@ -253,12 +280,15 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_embed_and_kee "context1": rl_chain.EmbedAndKeep(ctx_str_1), "context2": rl_chain.EmbedAndKeep(ctx_str_2), } - expected = "\n".join([ - f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1} |context2_sparse raw:={ctx_str_2}", - f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0 |b_sparse raw:=0", - f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=1", - f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=2"]) # noqa: E501 - + expected = "\n".join( + [ + f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1} |context2_sparse raw:={ctx_str_2}", + f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0 |b_sparse raw:=0", + f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=1", + f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=2", + ] + ) # noqa: E501 + selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -285,11 +315,14 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emb() -> N } context = {"context1": ctx_str_1, "context2": rl_chain.Embed(ctx_str_2)} - expected = "\n".join([ - f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1}", - f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0", - f"|action_sparse raw:=1", - f"|action_dense 0:1.0 1:0.0"]) # noqa: E501 + expected = "\n".join( + [ + f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1}", + f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0", + f"|action_sparse raw:=1", + f"|action_dense 0:1.0 1:0.0", + ] + ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( @@ -320,11 +353,14 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emakeep() ] } context = {"context1": ctx_str_1, "context2": rl_chain.EmbedAndKeep(ctx_str_2)} - expected = "\n".join([ - f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1} |context2_sparse raw:={ctx_str_2}", - f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0 |b_sparse raw:=0", - f"|action_sparse raw:=1", - f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=2"]) # noqa: E501 + expected = "\n".join( + [ + f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1} |context2_sparse raw:={ctx_str_2}", + f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0 |b_sparse raw:=0", + f"|action_sparse raw:=1", + f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=2", + ] + ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context, selected=selected @@ -348,10 +384,13 @@ def test_raw_features_underscored() -> None: # No embeddings named_actions = {"action": [str1]} context = {"context": ctx_str} - expected_no_embed = "\n".join([ - f"shared |context_sparse raw:={ctx_str_underscored}", - f"|action_sparse raw:={str1_underscored}"]) - + expected_no_embed = "\n".join( + [ + f"shared |context_sparse raw:={ctx_str_underscored}", + f"|action_sparse raw:={str1_underscored}", + ] + ) + event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) @@ -361,9 +400,9 @@ def test_raw_features_underscored() -> None: # Just embeddings named_actions = {"action": rl_chain.Embed([str1])} context = {"context": rl_chain.Embed(ctx_str)} - expected_embed = "\n".join([ - f"shared |context_dense {encoded_ctx_str}", - f"|action_dense {encoded_str1}"]) + expected_embed = "\n".join( + [f"shared |context_dense {encoded_ctx_str}", f"|action_dense {encoded_str1}"] + ) event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) @@ -373,9 +412,12 @@ def test_raw_features_underscored() -> None: # Embeddings and raw features named_actions = {"action": rl_chain.EmbedAndKeep([str1])} context = {"context": rl_chain.EmbedAndKeep(ctx_str)} - expected_embed_and_keep = "\n".join([ - f"shared |context_dense {encoded_ctx_str} |context_sparse raw:={ctx_str_underscored}", - f"|action_dense {encoded_str1} |action_sparse raw:={str1_underscored}"]) # noqa: E501 + expected_embed_and_keep = "\n".join( + [ + f"shared |context_dense {encoded_ctx_str} |context_sparse raw:={ctx_str_underscored}", + f"|action_dense {encoded_str1} |action_sparse raw:={str1_underscored}", + ] + ) # noqa: E501 event = pick_best_chain.PickBestEvent( inputs={}, to_select_from=named_actions, based_on=context ) diff --git a/tests/unit_tests/test_rl_loop_base_embedder.py b/tests/unit_tests/test_rl_loop_base_embedder.py index c93f2df..544d8c5 100644 --- a/tests/unit_tests/test_rl_loop_base_embedder.py +++ b/tests/unit_tests/test_rl_loop_base_embedder.py @@ -34,11 +34,15 @@ def test_simple_context_str_w_nested_emb() -> None: expected_dense = {"a_namespace": [4.0, 0.0]} expected_sparse = {"a_namespace": {"raw": str1}} - featurized = base.embed(base.EmbedAndKeep(base.Embed(str1)), MockEncoder(), "a_namespace") + featurized = base.embed( + base.EmbedAndKeep(base.Embed(str1)), MockEncoder(), "a_namespace" + ) assert featurized.dense == expected_dense assert featurized.sparse == {} - featurized = base.embed(base.Embed(base.EmbedAndKeep(str1)), MockEncoder(), "a_namespace") + featurized = base.embed( + base.Embed(base.EmbedAndKeep(str1)), MockEncoder(), "a_namespace" + ) assert featurized.sparse == expected_sparse assert featurized.dense == expected_dense @@ -49,6 +53,7 @@ def test_context_w_namespace_no_emb() -> None: assert featurized.sparse == expected_sparse assert featurized.dense == {} + def test_context_w_namespace_w_emb() -> None: str1 = "test" expected_sparse = {"test_namespace": {"raw": str1}} @@ -67,7 +72,7 @@ def test_context_w_namespace_w_emb2() -> None: str1 = "test" expected_sparse = {"test_namespace": {"raw": str1}} expected_dense = {"test_namespace": [4.0, 0.0]} - + featurized = base.embed(base.Embed({"test_namespace": str1}), MockEncoder()) assert featurized.sparse == {} assert featurized.dense == expected_dense @@ -83,16 +88,19 @@ def test_context_w_namespace_w_some_emb() -> None: expected_sparse = {"test_namespace": {"raw": str1}} expected_dense = {"test_namespace2": [5.0, 0.0]} featurized = base.embed( - {"test_namespace": str1, "test_namespace2": base.Embed(str2)}, MockEncoder() - ) + {"test_namespace": str1, "test_namespace2": base.Embed(str2)}, MockEncoder() + ) assert featurized.sparse == expected_sparse assert featurized.dense == expected_dense - expected_sparse = {"test_namespace": {"raw": str1}, "test_namespace2": {"raw": str2}} + expected_sparse = { + "test_namespace": {"raw": str1}, + "test_namespace2": {"raw": str2}, + } featurized = base.embed( - {"test_namespace": str1, "test_namespace2": base.EmbedAndKeep(str2)}, - MockEncoder(), - ) + {"test_namespace": str1, "test_namespace2": base.EmbedAndKeep(str2)}, + MockEncoder(), + ) assert featurized.sparse == expected_sparse assert featurized.dense == expected_dense @@ -104,7 +112,8 @@ def test_simple_action_strlist_no_emb() -> None: expected_sparse = [ {"a_namespace": {"raw": str1}}, {"a_namespace": {"raw": str2}}, - {"a_namespace": {"raw": str3}}] + {"a_namespace": {"raw": str3}}, + ] to_embed: List[Union[str, base._Embed]] = [str1, str2, str3] featurized = base.embed(to_embed, MockEncoder(), "a_namespace") @@ -121,18 +130,24 @@ def test_simple_action_strlist_w_emb() -> None: expected_sparse = [ {"a_namespace": {"raw": str1}}, {"a_namespace": {"raw": str2}}, - {"a_namespace": {"raw": str3}}] + {"a_namespace": {"raw": str3}}, + ] expected_dense = [ {"a_namespace": [4.0, 0.0]}, {"a_namespace": [5.0, 0.0]}, - {"a_namespace": [6.0, 0.0]}] - - featurized = base.embed(base.Embed([str1, str2, str3]), MockEncoder(), "a_namespace") + {"a_namespace": [6.0, 0.0]}, + ] + + featurized = base.embed( + base.Embed([str1, str2, str3]), MockEncoder(), "a_namespace" + ) for i in range(len(featurized)): assert featurized[i].sparse == {} assert featurized[i].dense == expected_dense[i] - featurized = base.embed(base.EmbedAndKeep([str1, str2, str3]), MockEncoder(), "a_namespace") + featurized = base.embed( + base.EmbedAndKeep([str1, str2, str3]), MockEncoder(), "a_namespace" + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] @@ -143,24 +158,25 @@ def test_simple_action_strlist_w_some_emb() -> None: str2 = "test_" str3 = "test__" - expected_sparse = [ - {"a_namespace": {"raw": str1}}, - {}, - {}] - expected_dense = [ - {}, - {"a_namespace": [5.0, 0.0]}, - {"a_namespace": [6.0, 0.0]}] - featurized = base.embed([str1, base.Embed(str2), base.Embed(str3)], MockEncoder(), "a_namespace") + expected_sparse = [{"a_namespace": {"raw": str1}}, {}, {}] + expected_dense = [{}, {"a_namespace": [5.0, 0.0]}, {"a_namespace": [6.0, 0.0]}] + featurized = base.embed( + [str1, base.Embed(str2), base.Embed(str3)], MockEncoder(), "a_namespace" + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] - featurized = base.embed([str1, base.EmbedAndKeep(str2), base.EmbedAndKeep(str3)], MockEncoder(), "a_namespace") + featurized = base.embed( + [str1, base.EmbedAndKeep(str2), base.EmbedAndKeep(str3)], + MockEncoder(), + "a_namespace", + ) expected_sparse = [ {"a_namespace": {"raw": str1}}, {"a_namespace": {"raw": str2}}, - {"a_namespace": {"raw": str3}}] + {"a_namespace": {"raw": str3}}, + ] for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] @@ -177,13 +193,13 @@ def test_action_w_namespace_no_emb() -> None: ] featurized = base.embed( - [ - {"test_namespace": str1}, - {"test_namespace": str2}, - {"test_namespace": str3}, - ], - MockEncoder(), - ) + [ + {"test_namespace": str1}, + {"test_namespace": str2}, + {"test_namespace": str3}, + ], + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == {} @@ -201,34 +217,34 @@ def test_action_w_namespace_w_emb() -> None: expected_dense = [ {"test_namespace": [4.0, 0.0]}, {"test_namespace": [5.0, 0.0]}, - {"test_namespace": [6.0, 0.0]}] + {"test_namespace": [6.0, 0.0]}, + ] featurized = base.embed( - [ - {"test_namespace": base.Embed(str1)}, - {"test_namespace": base.Embed(str2)}, - {"test_namespace": base.Embed(str3)}, - ], - MockEncoder(), - ) + [ + {"test_namespace": base.Embed(str1)}, + {"test_namespace": base.Embed(str2)}, + {"test_namespace": base.Embed(str3)}, + ], + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == {} assert featurized[i].dense == expected_dense[i] featurized = base.embed( - [ - {"test_namespace": base.EmbedAndKeep(str1)}, - {"test_namespace": base.EmbedAndKeep(str2)}, - {"test_namespace": base.EmbedAndKeep(str3)}, - ], - MockEncoder(), - ) + [ + {"test_namespace": base.EmbedAndKeep(str1)}, + {"test_namespace": base.EmbedAndKeep(str2)}, + {"test_namespace": base.EmbedAndKeep(str3)}, + ], + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] - def test_action_w_namespace_w_emb2() -> None: str1 = "test" str2 = "test_" @@ -241,32 +257,33 @@ def test_action_w_namespace_w_emb2() -> None: expected_dense = [ {"test_namespace1": [4.0, 0.0]}, {"test_namespace2": [5.0, 0.0]}, - {"test_namespace3": [6.0, 0.0]}] - + {"test_namespace3": [6.0, 0.0]}, + ] + featurized = base.embed( - base.Embed( - [ - {"test_namespace1": str1}, - {"test_namespace2": str2}, - {"test_namespace3": str3}, - ] - ), - MockEncoder(), - ) + base.Embed( + [ + {"test_namespace1": str1}, + {"test_namespace2": str2}, + {"test_namespace3": str3}, + ] + ), + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == {} assert featurized[i].dense == expected_dense[i] featurized = base.embed( - base.EmbedAndKeep( - [ - {"test_namespace1": str1}, - {"test_namespace2": str2}, - {"test_namespace3": str3}, - ] - ), - MockEncoder(), - ) + base.EmbedAndKeep( + [ + {"test_namespace1": str1}, + {"test_namespace2": str2}, + {"test_namespace3": str3}, + ] + ), + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] @@ -284,33 +301,34 @@ def test_action_w_namespace_w_some_emb() -> None: expected_dense = [ {}, {"test_namespace": [5.0, 0.0]}, - {"test_namespace": [6.0, 0.0]}] - + {"test_namespace": [6.0, 0.0]}, + ] + featurized = base.embed( - [ - {"test_namespace": str1}, - {"test_namespace": base.Embed(str2)}, - {"test_namespace": base.Embed(str3)}, - ], - MockEncoder(), - ) + [ + {"test_namespace": str1}, + {"test_namespace": base.Embed(str2)}, + {"test_namespace": base.Embed(str3)}, + ], + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] - + expected_sparse = [ {"test_namespace": {"raw": str1}}, {"test_namespace": {"raw": str2}}, {"test_namespace": {"raw": str3}}, ] featurized = base.embed( - [ - {"test_namespace": str1}, - {"test_namespace": base.EmbedAndKeep(str2)}, - {"test_namespace": base.EmbedAndKeep(str3)}, - ], - MockEncoder(), - ) + [ + {"test_namespace": str1}, + {"test_namespace": base.EmbedAndKeep(str2)}, + {"test_namespace": base.EmbedAndKeep(str3)}, + ], + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] @@ -328,16 +346,17 @@ def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None: expected_dense = [ {"test_namespace": [4.0, 0.0]}, {"test_namespace": [5.0, 0.0]}, - {"test_namespace": [6.0, 0.0]}] - + {"test_namespace": [6.0, 0.0]}, + ] + featurized = base.embed( - [ - {"test_namespace": base.Embed(str1), "test_namespace2": str1}, - {"test_namespace": base.Embed(str2), "test_namespace2": str2}, - {"test_namespace": base.Embed(str3), "test_namespace2": str3}, - ], - MockEncoder(), - ) + [ + {"test_namespace": base.Embed(str1), "test_namespace2": str1}, + {"test_namespace": base.Embed(str2), "test_namespace2": str2}, + {"test_namespace": base.Embed(str3), "test_namespace2": str3}, + ], + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] @@ -348,13 +367,13 @@ def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None: {"test_namespace": {"raw": str3}, "test_namespace2": {"raw": str3}}, ] featurized = base.embed( - [ - {"test_namespace": base.EmbedAndKeep(str1), "test_namespace2": str1}, - {"test_namespace": base.EmbedAndKeep(str2), "test_namespace2": str2}, - {"test_namespace": base.EmbedAndKeep(str3), "test_namespace2": str3}, - ], - MockEncoder(), - ) + [ + {"test_namespace": base.EmbedAndKeep(str1), "test_namespace2": str1}, + {"test_namespace": base.EmbedAndKeep(str2), "test_namespace2": str2}, + {"test_namespace": base.EmbedAndKeep(str3), "test_namespace2": str3}, + ], + MockEncoder(), + ) for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] assert featurized[i].dense == expected_dense[i] @@ -365,7 +384,8 @@ def test_one_namespace_w_list_of_features_no_emb() -> None: str2 = "test2" expected_sparse = { "test_namespace_0": {"raw": str1}, - "test_namespace_1": {"raw": str2}} + "test_namespace_1": {"raw": str2}, + } featurized = base.embed({"test_namespace": [str1, str2]}, MockEncoder()) assert featurized.sparse == expected_sparse From c333c0ab566afa471ba897a911e0aea982c8146f Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Wed, 15 Nov 2023 01:12:35 -0500 Subject: [PATCH 7/9] raw -> default_ft --- src/learn_to_pick/base.py | 4 +- src/learn_to_pick/pick_best.py | 12 +-- tests/unit_tests/test_pick_best_call.py | 30 +++---- .../test_pick_best_text_embedder.py | 86 +++++++++---------- .../unit_tests/test_rl_loop_base_embedder.py | 82 +++++++++--------- 5 files changed, 107 insertions(+), 107 deletions(-) diff --git a/src/learn_to_pick/base.py b/src/learn_to_pick/base.py index dab5e45..a449815 100644 --- a/src/learn_to_pick/base.py +++ b/src/learn_to_pick/base.py @@ -497,10 +497,10 @@ def _embed_string_type( result[namespace] = DenseFeatures(model.encode(item.value)) if item.keep: keep_str = item.value.replace(" ", "_") - result[namespace] = {"raw": re.sub(r"[\t\n\r\f\v]+", " ", keep_str)} + result[namespace] = {"default_ft": re.sub(r"[\t\n\r\f\v]+", " ", keep_str)} elif isinstance(item, str): encoded = item.replace(" ", "_") - result[namespace] = {"raw": re.sub(r"[\t\n\r\f\v]+", " ", encoded)} + result[namespace] = {"default_ft": re.sub(r"[\t\n\r\f\v]+", " ", encoded)} else: raise ValueError(f"Unsupported type {type(item)} for embedding") diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py index df8c6ac..1682010 100644 --- a/src/learn_to_pick/pick_best.py +++ b/src/learn_to_pick/pick_best.py @@ -123,14 +123,14 @@ def __init__( def _dotproducts(self, context, actions): _context_dense = base.Featurized() for ns in context.sparse.keys(): - if "raw" in context.sparse[ns]: - _context_dense[ns] = self.model.encode(context.sparse[ns]["raw"]) + if "default_ft" in context.sparse[ns]: + _context_dense[ns] = self.model.encode(context.sparse[ns]["default_ft"]) _actions_dense = [base.Featurized() for _ in range(len(actions))] for _action, action in zip(_actions_dense, actions): for ns in action.sparse.keys(): - if "raw" in action.sparse[ns]: - _action[ns] = self.model.encode(action.sparse[ns]["raw"]) + if "default_ft" in action.sparse[ns]: + _action[ns] = self.model.encode(action.sparse[ns]["default_ft"]) context_names = list(_context_dense.dense.keys()) context_matrix = np.stack(list(_context_dense.dense.values())) @@ -146,8 +146,8 @@ def _dotproducts(self, context, actions): def _generic_namespace(self, featurized): result = base.SparseFeatures() for ns in featurized.sparse.keys(): - if "raw" in featurized.sparse[ns]: - result[ns] = featurized.sparse[ns]["raw"] + if "default_ft" in featurized.sparse[ns]: + result[ns] = featurized.sparse[ns]["default_ft"] return result def _generic_namespaces(self, context, actions): diff --git a/tests/unit_tests/test_pick_best_call.py b/tests/unit_tests/test_pick_best_call.py index 24b374b..d35d4b2 100644 --- a/tests/unit_tests/test_pick_best_call.py +++ b/tests/unit_tests/test_pick_best_call.py @@ -169,10 +169,10 @@ def test_everything_embedded() -> None: expected = "\n".join( [ - f"shared |User_dense {encoded_ctx_str_1} |User_sparse raw:={ctx_str_1}", - f"|action_dense {action_dense} |action_sparse raw:={str1}", - f"|action_dense {action_dense} |action_sparse raw:={str2}", - f"|action_dense {action_dense} |action_sparse raw:={str3}", + f"shared |User_dense {encoded_ctx_str_1} |User_sparse default_ft:={ctx_str_1}", + f"|action_dense {action_dense} |action_sparse default_ft:={str1}", + f"|action_dense {action_dense} |action_sparse default_ft:={str2}", + f"|action_dense {action_dense} |action_sparse default_ft:={str3}", ] ) # noqa @@ -198,10 +198,10 @@ def test_default_auto_embedder_is_off() -> None: expected = "\n".join( [ - f"shared |User_sparse raw:={ctx_str_1}", - f"|action_sparse raw:={str1}", - f"|action_sparse raw:={str2}", - f"|action_sparse raw:={str3}", + f"shared |User_sparse default_ft:={ctx_str_1}", + f"|action_sparse default_ft:={str1}", + f"|action_sparse default_ft:={str2}", + f"|action_sparse default_ft:={str3}", ] ) # noqa @@ -227,10 +227,10 @@ def test_default_w_embeddings_off() -> None: expected = "\n".join( [ - f"shared |User_sparse raw:={ctx_str_1}", - f"|action_sparse raw:={str1}", - f"|action_sparse raw:={str2}", - f"|action_sparse raw:={str3}", + f"shared |User_sparse default_ft:={ctx_str_1}", + f"|action_sparse default_ft:={str1}", + f"|action_sparse default_ft:={str2}", + f"|action_sparse default_ft:={str3}", ] ) # noqa @@ -258,9 +258,9 @@ def test_default_w_embeddings_on() -> None: expected = "\n".join( [ - f"shared |User_sparse raw:={ctx_str_1} |@_sparse User:={ctx_str_1}", - f"|action_sparse raw:={str1} |{dot_prod} |#_sparse action:={str1} ", - f"|action_sparse raw:={str2} |{dot_prod} |#_sparse action:={str2} ", + f"shared |User_sparse default_ft:={ctx_str_1} |@_sparse User:={ctx_str_1}", + f"|action_sparse default_ft:={str1} |{dot_prod} |#_sparse action:={str1} ", + f"|action_sparse default_ft:={str2} |{dot_prod} |#_sparse action:={str2} ", ] ) # noqa diff --git a/tests/unit_tests/test_pick_best_text_embedder.py b/tests/unit_tests/test_pick_best_text_embedder.py index 414341a..b5aafd8 100644 --- a/tests/unit_tests/test_pick_best_text_embedder.py +++ b/tests/unit_tests/test_pick_best_text_embedder.py @@ -35,10 +35,10 @@ def test_pickbest_textembedder_no_label_no_emb() -> None: named_actions = {"action": ["0", "1", "2"]} expected = "\n".join( [ - "shared |context_sparse raw:=context", - "|action_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2", + "shared |context_sparse default_ft:=context", + "|action_sparse default_ft:=0", + "|action_sparse default_ft:=1", + "|action_sparse default_ft:=2", ] ) @@ -56,10 +56,10 @@ def test_pickbest_textembedder_w_label_no_score_no_emb() -> None: named_actions = {"action": ["0", "1", "2"]} expected = "\n".join( [ - "shared |context_sparse raw:=context", - "|action_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2", + "shared |context_sparse default_ft:=context", + "|action_sparse default_ft:=0", + "|action_sparse default_ft:=1", + "|action_sparse default_ft:=2", ] ) selected = pick_best_chain.PickBestSelected(index=0, probability=1.0) @@ -80,10 +80,10 @@ def test_pickbest_textembedder_w_full_label_no_emb() -> None: named_actions = {"action": ["0", "1", "2"]} expected = "\n".join( [ - "shared |context_sparse raw:=context", - "0:-0.0:1.0 |action_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2", + "shared |context_sparse default_ft:=context", + "0:-0.0:1.0 |action_sparse default_ft:=0", + "|action_sparse default_ft:=1", + "|action_sparse default_ft:=2", ] ) @@ -142,10 +142,10 @@ def test_pickbest_textembedder_w_full_label_w_embed_and_keep() -> None: context = {"context": rl_chain.EmbedAndKeep(ctx_str)} expected = "\n".join( [ - f"shared |context_dense {encoded_ctx_str} |context_sparse raw:={ctx_str}", - "0:-0.0:1.0 |action_dense 0:1.0 1:0.0 |action_sparse raw:=0", - "|action_dense 0:1.0 1:0.0 |action_sparse raw:=1", - "|action_dense 0:1.0 1:0.0 |action_sparse raw:=2", + f"shared |context_dense {encoded_ctx_str} |context_sparse default_ft:={ctx_str}", + "0:-0.0:1.0 |action_dense 0:1.0 1:0.0 |action_sparse default_ft:=0", + "|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=1", + "|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=2", ] ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) @@ -164,10 +164,10 @@ def test_pickbest_textembedder_more_namespaces_no_label_no_emb() -> None: context = {"context1": "context1", "context2": "context2"} expected = "\n".join( [ - "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2 ", - "|a_sparse raw:=0 |b_sparse raw:=0", - "|action1_sparse raw:=1", - "|action1_sparse raw:=2", + "shared |context1_sparse default_ft:=context1 |context2_sparse default_ft:=context2 ", + "|a_sparse default_ft:=0 |b_sparse default_ft:=0", + "|action1_sparse default_ft:=1", + "|action1_sparse default_ft:=2", ] ) # noqa: E501 event = pick_best_chain.PickBestEvent( @@ -185,10 +185,10 @@ def test_pickbest_textembedder_more_namespaces_w_label_no_emb() -> None: context = {"context1": "context1", "context2": "context2"} expected = "\n".join( [ - "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2", - "|a_sparse raw:=0 |b_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2", + "shared |context1_sparse default_ft:=context1 |context2_sparse default_ft:=context2", + "|a_sparse default_ft:=0 |b_sparse default_ft:=0", + "|action_sparse default_ft:=1", + "|action_sparse default_ft:=2", ] ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0) @@ -207,10 +207,10 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_no_emb() -> None: context = {"context1": "context1", "context2": "context2"} expected = "\n".join( [ - "shared |context1_sparse raw:=context1 |context2_sparse raw:=context2", - "0:-0.0:1.0 |a_sparse raw:=0 |b_sparse raw:=0", - "|action_sparse raw:=1", - "|action_sparse raw:=2", + "shared |context1_sparse default_ft:=context1 |context2_sparse default_ft:=context2", + "0:-0.0:1.0 |a_sparse default_ft:=0 |b_sparse default_ft:=0", + "|action_sparse default_ft:=1", + "|action_sparse default_ft:=2", ] ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) @@ -282,10 +282,10 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_embed_and_kee } expected = "\n".join( [ - f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1} |context2_sparse raw:={ctx_str_2}", - f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0 |b_sparse raw:=0", - f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=1", - f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=2", + f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2} |context1_sparse default_ft:={ctx_str_1} |context2_sparse default_ft:={ctx_str_2}", + f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0 |a_sparse default_ft:=0 |b_sparse default_ft:=0", + f"|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=1", + f"|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=2", ] ) # noqa: E501 @@ -317,9 +317,9 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emb() -> N expected = "\n".join( [ - f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1}", - f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0", - f"|action_sparse raw:=1", + f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse default_ft:={ctx_str_1}", + f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse default_ft:=0", + f"|action_sparse default_ft:=1", f"|action_dense 0:1.0 1:0.0", ] ) # noqa: E501 @@ -355,10 +355,10 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emakeep() context = {"context1": ctx_str_1, "context2": rl_chain.EmbedAndKeep(ctx_str_2)} expected = "\n".join( [ - f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse raw:={ctx_str_1} |context2_sparse raw:={ctx_str_2}", - f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse raw:=0 |b_sparse raw:=0", - f"|action_sparse raw:=1", - f"|action_dense 0:1.0 1:0.0 |action_sparse raw:=2", + f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse default_ft:={ctx_str_1} |context2_sparse default_ft:={ctx_str_2}", + f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse default_ft:=0 |b_sparse default_ft:=0", + f"|action_sparse default_ft:=1", + f"|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=2", ] ) # noqa: E501 selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0) @@ -386,8 +386,8 @@ def test_raw_features_underscored() -> None: context = {"context": ctx_str} expected_no_embed = "\n".join( [ - f"shared |context_sparse raw:={ctx_str_underscored}", - f"|action_sparse raw:={str1_underscored}", + f"shared |context_sparse default_ft:={ctx_str_underscored}", + f"|action_sparse default_ft:={str1_underscored}", ] ) @@ -414,8 +414,8 @@ def test_raw_features_underscored() -> None: context = {"context": rl_chain.EmbedAndKeep(ctx_str)} expected_embed_and_keep = "\n".join( [ - f"shared |context_dense {encoded_ctx_str} |context_sparse raw:={ctx_str_underscored}", - f"|action_dense {encoded_str1} |action_sparse raw:={str1_underscored}", + f"shared |context_dense {encoded_ctx_str} |context_sparse default_ft:={ctx_str_underscored}", + f"|action_dense {encoded_str1} |action_sparse default_ft:={str1_underscored}", ] ) # noqa: E501 event = pick_best_chain.PickBestEvent( diff --git a/tests/unit_tests/test_rl_loop_base_embedder.py b/tests/unit_tests/test_rl_loop_base_embedder.py index 544d8c5..2bf2bf0 100644 --- a/tests/unit_tests/test_rl_loop_base_embedder.py +++ b/tests/unit_tests/test_rl_loop_base_embedder.py @@ -7,7 +7,7 @@ def test_simple_context_str_no_emb() -> None: - expected = {"a_namespace": {"raw": "test"}} + expected = {"a_namespace": {"default_ft": "test"}} featurized = base.embed("test", MockEncoder(), "a_namespace") assert featurized.sparse == expected @@ -17,7 +17,7 @@ def test_simple_context_str_no_emb() -> None: def test_simple_context_str_w_emb() -> None: str1 = "test" expected_dense = {"a_namespace": [4.0, 0.0]} - expected_sparse = {"a_namespace": {"raw": str1}} + expected_sparse = {"a_namespace": {"default_ft": str1}} featurized = base.embed(base.Embed(str1), MockEncoder(), "a_namespace") assert featurized.dense == expected_dense @@ -32,7 +32,7 @@ def test_simple_context_str_w_nested_emb() -> None: # nested embeddings, innermost wins str1 = "test" expected_dense = {"a_namespace": [4.0, 0.0]} - expected_sparse = {"a_namespace": {"raw": str1}} + expected_sparse = {"a_namespace": {"default_ft": str1}} featurized = base.embed( base.EmbedAndKeep(base.Embed(str1)), MockEncoder(), "a_namespace" @@ -48,7 +48,7 @@ def test_simple_context_str_w_nested_emb() -> None: def test_context_w_namespace_no_emb() -> None: - expected_sparse = {"test_namespace": {"raw": "test"}} + expected_sparse = {"test_namespace": {"default_ft": "test"}} featurized = base.embed({"test_namespace": "test"}, MockEncoder()) assert featurized.sparse == expected_sparse assert featurized.dense == {} @@ -56,7 +56,7 @@ def test_context_w_namespace_no_emb() -> None: def test_context_w_namespace_w_emb() -> None: str1 = "test" - expected_sparse = {"test_namespace": {"raw": str1}} + expected_sparse = {"test_namespace": {"default_ft": str1}} expected_dense = {"test_namespace": [4.0, 0.0]} featurized = base.embed({"test_namespace": base.Embed(str1)}, MockEncoder()) @@ -70,7 +70,7 @@ def test_context_w_namespace_w_emb() -> None: def test_context_w_namespace_w_emb2() -> None: str1 = "test" - expected_sparse = {"test_namespace": {"raw": str1}} + expected_sparse = {"test_namespace": {"default_ft": str1}} expected_dense = {"test_namespace": [4.0, 0.0]} featurized = base.embed(base.Embed({"test_namespace": str1}), MockEncoder()) @@ -85,7 +85,7 @@ def test_context_w_namespace_w_emb2() -> None: def test_context_w_namespace_w_some_emb() -> None: str1 = "test" str2 = "test_" - expected_sparse = {"test_namespace": {"raw": str1}} + expected_sparse = {"test_namespace": {"default_ft": str1}} expected_dense = {"test_namespace2": [5.0, 0.0]} featurized = base.embed( {"test_namespace": str1, "test_namespace2": base.Embed(str2)}, MockEncoder() @@ -94,8 +94,8 @@ def test_context_w_namespace_w_some_emb() -> None: assert featurized.dense == expected_dense expected_sparse = { - "test_namespace": {"raw": str1}, - "test_namespace2": {"raw": str2}, + "test_namespace": {"default_ft": str1}, + "test_namespace2": {"default_ft": str2}, } featurized = base.embed( {"test_namespace": str1, "test_namespace2": base.EmbedAndKeep(str2)}, @@ -110,9 +110,9 @@ def test_simple_action_strlist_no_emb() -> None: str2 = "test2" str3 = "test3" expected_sparse = [ - {"a_namespace": {"raw": str1}}, - {"a_namespace": {"raw": str2}}, - {"a_namespace": {"raw": str3}}, + {"a_namespace": {"default_ft": str1}}, + {"a_namespace": {"default_ft": str2}}, + {"a_namespace": {"default_ft": str3}}, ] to_embed: List[Union[str, base._Embed]] = [str1, str2, str3] featurized = base.embed(to_embed, MockEncoder(), "a_namespace") @@ -128,9 +128,9 @@ def test_simple_action_strlist_w_emb() -> None: str3 = "test__" expected_sparse = [ - {"a_namespace": {"raw": str1}}, - {"a_namespace": {"raw": str2}}, - {"a_namespace": {"raw": str3}}, + {"a_namespace": {"default_ft": str1}}, + {"a_namespace": {"default_ft": str2}}, + {"a_namespace": {"default_ft": str3}}, ] expected_dense = [ {"a_namespace": [4.0, 0.0]}, @@ -158,7 +158,7 @@ def test_simple_action_strlist_w_some_emb() -> None: str2 = "test_" str3 = "test__" - expected_sparse = [{"a_namespace": {"raw": str1}}, {}, {}] + expected_sparse = [{"a_namespace": {"default_ft": str1}}, {}, {}] expected_dense = [{}, {"a_namespace": [5.0, 0.0]}, {"a_namespace": [6.0, 0.0]}] featurized = base.embed( [str1, base.Embed(str2), base.Embed(str3)], MockEncoder(), "a_namespace" @@ -173,9 +173,9 @@ def test_simple_action_strlist_w_some_emb() -> None: "a_namespace", ) expected_sparse = [ - {"a_namespace": {"raw": str1}}, - {"a_namespace": {"raw": str2}}, - {"a_namespace": {"raw": str3}}, + {"a_namespace": {"default_ft": str1}}, + {"a_namespace": {"default_ft": str2}}, + {"a_namespace": {"default_ft": str3}}, ] for i in range(len(featurized)): assert featurized[i].sparse == expected_sparse[i] @@ -187,9 +187,9 @@ def test_action_w_namespace_no_emb() -> None: str2 = "test2" str3 = "test3" expected_sparse = [ - {"test_namespace": {"raw": str1}}, - {"test_namespace": {"raw": str2}}, - {"test_namespace": {"raw": str3}}, + {"test_namespace": {"default_ft": str1}}, + {"test_namespace": {"default_ft": str2}}, + {"test_namespace": {"default_ft": str3}}, ] featurized = base.embed( @@ -210,9 +210,9 @@ def test_action_w_namespace_w_emb() -> None: str2 = "test_" str3 = "test__" expected_sparse = [ - {"test_namespace": {"raw": str1}}, - {"test_namespace": {"raw": str2}}, - {"test_namespace": {"raw": str3}}, + {"test_namespace": {"default_ft": str1}}, + {"test_namespace": {"default_ft": str2}}, + {"test_namespace": {"default_ft": str3}}, ] expected_dense = [ {"test_namespace": [4.0, 0.0]}, @@ -250,9 +250,9 @@ def test_action_w_namespace_w_emb2() -> None: str2 = "test_" str3 = "test__" expected_sparse = [ - {"test_namespace1": {"raw": str1}}, - {"test_namespace2": {"raw": str2}}, - {"test_namespace3": {"raw": str3}}, + {"test_namespace1": {"default_ft": str1}}, + {"test_namespace2": {"default_ft": str2}}, + {"test_namespace3": {"default_ft": str3}}, ] expected_dense = [ {"test_namespace1": [4.0, 0.0]}, @@ -294,7 +294,7 @@ def test_action_w_namespace_w_some_emb() -> None: str2 = "test_" str3 = "test__" expected_sparse = [ - {"test_namespace": {"raw": str1}}, + {"test_namespace": {"default_ft": str1}}, {}, {}, ] @@ -317,9 +317,9 @@ def test_action_w_namespace_w_some_emb() -> None: assert featurized[i].dense == expected_dense[i] expected_sparse = [ - {"test_namespace": {"raw": str1}}, - {"test_namespace": {"raw": str2}}, - {"test_namespace": {"raw": str3}}, + {"test_namespace": {"default_ft": str1}}, + {"test_namespace": {"default_ft": str2}}, + {"test_namespace": {"default_ft": str3}}, ] featurized = base.embed( [ @@ -339,9 +339,9 @@ def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None: str2 = "test_" str3 = "test__" expected_sparse = [ - {"test_namespace2": {"raw": str1}}, - {"test_namespace2": {"raw": str2}}, - {"test_namespace2": {"raw": str3}}, + {"test_namespace2": {"default_ft": str1}}, + {"test_namespace2": {"default_ft": str2}}, + {"test_namespace2": {"default_ft": str3}}, ] expected_dense = [ {"test_namespace": [4.0, 0.0]}, @@ -362,9 +362,9 @@ def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None: assert featurized[i].dense == expected_dense[i] expected_sparse = [ - {"test_namespace": {"raw": str1}, "test_namespace2": {"raw": str1}}, - {"test_namespace": {"raw": str2}, "test_namespace2": {"raw": str2}}, - {"test_namespace": {"raw": str3}, "test_namespace2": {"raw": str3}}, + {"test_namespace": {"default_ft": str1}, "test_namespace2": {"default_ft": str1}}, + {"test_namespace": {"default_ft": str2}, "test_namespace2": {"default_ft": str2}}, + {"test_namespace": {"default_ft": str3}, "test_namespace2": {"default_ft": str3}}, ] featurized = base.embed( [ @@ -383,8 +383,8 @@ def test_one_namespace_w_list_of_features_no_emb() -> None: str1 = "test1" str2 = "test2" expected_sparse = { - "test_namespace_0": {"raw": str1}, - "test_namespace_1": {"raw": str2}, + "test_namespace_0": {"default_ft": str1}, + "test_namespace_1": {"default_ft": str2}, } featurized = base.embed({"test_namespace": [str1, str2]}, MockEncoder()) @@ -395,7 +395,7 @@ def test_one_namespace_w_list_of_features_no_emb() -> None: def test_one_namespace_w_list_of_features_w_some_emb() -> None: str1 = "test" str2 = "test_" - expected_sparse = {"test_namespace_0": {"raw": str1}} + expected_sparse = {"test_namespace_0": {"default_ft": str1}} expected_dense = {"test_namespace_1": [5.0, 0.0]} featurized = base.embed({"test_namespace": [str1, base.Embed(str2)]}, MockEncoder()) From 56d69f0a02c7d493fbd618e28641311637558ee6 Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Wed, 15 Nov 2023 09:20:47 -0500 Subject: [PATCH 8/9] black --- tests/unit_tests/test_rl_loop_base_embedder.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests/test_rl_loop_base_embedder.py b/tests/unit_tests/test_rl_loop_base_embedder.py index 2bf2bf0..18a259a 100644 --- a/tests/unit_tests/test_rl_loop_base_embedder.py +++ b/tests/unit_tests/test_rl_loop_base_embedder.py @@ -362,9 +362,18 @@ def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None: assert featurized[i].dense == expected_dense[i] expected_sparse = [ - {"test_namespace": {"default_ft": str1}, "test_namespace2": {"default_ft": str1}}, - {"test_namespace": {"default_ft": str2}, "test_namespace2": {"default_ft": str2}}, - {"test_namespace": {"default_ft": str3}, "test_namespace2": {"default_ft": str3}}, + { + "test_namespace": {"default_ft": str1}, + "test_namespace2": {"default_ft": str1}, + }, + { + "test_namespace": {"default_ft": str2}, + "test_namespace2": {"default_ft": str2}, + }, + { + "test_namespace": {"default_ft": str3}, + "test_namespace2": {"default_ft": str3}, + }, ] featurized = base.embed( [ From 43ad2978dac5c7eb3664d2753c5317eeb359767c Mon Sep 17 00:00:00 2001 From: Alexey Taymanov Date: Wed, 15 Nov 2023 11:09:49 -0500 Subject: [PATCH 9/9] cleanup --- src/learn_to_pick/base.py | 3 +-- src/learn_to_pick/pick_best.py | 23 +++++++---------------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/src/learn_to_pick/base.py b/src/learn_to_pick/base.py index a449815..e1a85aa 100644 --- a/src/learn_to_pick/base.py +++ b/src/learn_to_pick/base.py @@ -165,10 +165,9 @@ def __init__( featurizer: Featurizer, formatter: Callable, vw_logger: VwLogger, - *args: Any, **kwargs: Any, ): - super().__init__(*args, **kwargs) + super().__init__(**kwargs) self.model_repo = model_repo self.vw_cmd = vw_cmd self.workspace = self.model_repo.load(vw_cmd) diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py index 1682010..abf70d6 100644 --- a/src/learn_to_pick/pick_best.py +++ b/src/learn_to_pick/pick_best.py @@ -143,17 +143,19 @@ def _dotproducts(self, context, actions): for j in range(len(action_names)) } - def _generic_namespace(self, featurized): + @staticmethod + def _generic_namespace(featurized): result = base.SparseFeatures() for ns in featurized.sparse.keys(): if "default_ft" in featurized.sparse[ns]: result[ns] = featurized.sparse[ns]["default_ft"] return result - def _generic_namespaces(self, context, actions): - context["@"] = self._generic_namespace(context) + @staticmethod + def _generic_namespaces(context, actions): + context["@"] = PickBestFeaturizer._generic_namespace(context) for a in actions: - a["#"] = self._generic_namespace(a) + a["#"] = PickBestFeaturizer._generic_namespace(a) def featurize( self, event: PickBestEvent @@ -163,7 +165,7 @@ def featurize( if self.auto_embed: self._dotproducts(context, actions) - self._generic_namespaces(context, actions) + PickBestFeaturizer._generic_namespaces(context, actions) return context, actions, event.selected @@ -183,9 +185,6 @@ def vw_cb_formatter( class PickBestRandomPolicy(base.Policy[PickBestEvent]): - def __init__(self): - ... - def predict(self, event: PickBestEvent) -> List[Tuple[int, float]]: num_items = len(event.to_select_from) return [(i, 1.0 / num_items) for i in range(num_items)] @@ -265,14 +264,6 @@ def _call_after_predict_before_scoring( next_inputs = inputs.copy() - # only one key, value pair in event.to_select_from - value = next(iter(event.to_select_from.values())) - v = ( - value[event.selected.index] - if event.selected - else event.to_select_from.values() - ) - picked = {} for k, v in event.to_select_from.items(): picked[k] = v[event.selected.index]