From d381e79fb5563c4a721d6d855eeed734abca352b Mon Sep 17 00:00:00 2001
From: dalek-who <839991169@qq.com>
Date: Sun, 20 Sep 2020 00:12:59 +0800
Subject: [PATCH 1/6] DeepRankPreprocessor

---
 matchzoo/preprocessors/__init__.py            |   1 +
 .../preprocessors/deeprank_preprocessor.py    | 166 ++++++++++++++++++
 matchzoo/preprocessors/units/__init__.py      |   2 +
 .../preprocessors/units/frequency_counter.py  |  63 +++++++
 .../units/padding_left_and_right.py           |  56 ++++++
 5 files changed, 288 insertions(+)
 create mode 100644 matchzoo/preprocessors/deeprank_preprocessor.py
 create mode 100644 matchzoo/preprocessors/units/frequency_counter.py
 create mode 100644 matchzoo/preprocessors/units/padding_left_and_right.py
diff --git a/matchzoo/preprocessors/__init__.py b/matchzoo/preprocessors/__init__.py
index 90ba40c..7239be9 100755
--- a/matchzoo/preprocessors/__init__.py
+++ b/matchzoo/preprocessors/__init__.py
@@ -2,6 +2,7 @@
 from .naive_preprocessor import NaivePreprocessor
 from .basic_preprocessor import BasicPreprocessor
 from .bert_preprocessor import BertPreprocessor
+from .deeprank_preprocessor import DeepRankPreprocessor
 
 
 def list_available() -> list:
diff --git a/matchzoo/preprocessors/deeprank_preprocessor.py b/matchzoo/preprocessors/deeprank_preprocessor.py
new file mode 100644
index 0000000..317b3c7
--- /dev/null
+++ b/matchzoo/preprocessors/deeprank_preprocessor.py
@@ -0,0 +1,166 @@
+"""DeepRank Preprocessor."""
+
+from tqdm import tqdm
+import typing
+
+from . import units
+from matchzoo import DataPack
+from matchzoo.engine.base_preprocessor import BasePreprocessor
+from .build_vocab_unit import build_vocab_unit
+from .build_unit_from_data_pack import build_unit_from_data_pack
+from .chain_transform import chain_transform
+
+tqdm.pandas()
+
+
+class DeepRankPreprocessor(BasePreprocessor):
+    """
+    DeepRank model preprocessor helper.
+        For pre-processing, all the words in documents and queries are white-space tokenized,
+        lower-cased, and stemmed using the Krovetz stemmer.
+        Stopword removal is performed on query and document words using the INQUERY stop list.
+        Words occurred less than 5 times in the collection are removed from all the document.
+        Querys are truncated below a max_length.
+
+    :param filter_low_freq: Float, lower bound value used by
+        :class:`FrequenceFilterUnit`.
+    :param half_window_size: int, half of the match window size (not including the center word),
+        so the real window size is 2 * half_window_size + 1
+    :padding_token_index: int: vocabulary index of pad token, default 0
+
+    Example:
+        >>> import matchzoo as mz
+        >>> train_data = mz.datasets.toy.load_data('train')
+        >>> test_data = mz.datasets.toy.load_data('test')
+        >>> preprocessor = mz.preprocessors.DeepRankPreprocessor(
+        ...     filter_low_freq=5,
+        ...     half_window_size=10,
+        ...     padding_token_index=0,
+        ... )
+        >>> preprocessor = preprocessor.fit(train_data, verbose=0)
+        >>> preprocessor.context['vocab_size']
+        105
+        >>> processed_train_data = preprocessor.transform(train_data,
+        ...                                               verbose=0)
+        >>> type(processed_train_data)
+        <class 'matchzoo.data_pack.data_pack.DataPack'>
+        >>> test_data_transformed = preprocessor.transform(test_data,
+        ...                                                verbose=0)
+        >>> type(test_data_transformed)
+        <class 'matchzoo.data_pack.data_pack.DataPack'>
+
+    """
+
+    def __init__(self,
+                 truncated_mode: str = 'pre',
+                 truncated_length_left: int = None,
+                 truncated_length_right: int = None,
+                 filter_low_freq: float = 5,
+                 half_window_size: int = 5,
+                 padding_token_index: int = 0):
+        """Initialization."""
+        super().__init__()
+
+        self._truncated_mode = truncated_mode
+        self._truncated_length_left = truncated_length_left
+        self._truncated_length_right = truncated_length_right
+        if self._truncated_length_left:
+            self._left_truncatedlength_unit = units.TruncatedLength(
+                self._truncated_length_left, self._truncated_mode
+            )
+        if self._truncated_length_right:
+            self._right_truncatedlength_unit = units.TruncatedLength(
+                self._truncated_length_right, self._truncated_mode
+            )
+
+        self._counter_unit = units.FrequencyCounter()
+        self._filter_unit = units.FrequencyFilter(
+            low=filter_low_freq,
+            mode="tf"
+        )
+
+        self._units = [
+            units.Tokenize(),
+            units.Lowercase(),
+            units.Stemming(stemmer="krovetz"),
+            units.stop_removal.StopRemoval(),  # todo: INQUERY stop list ?
+        ]
+
+        self._context["filter_low_freq"] = filter_low_freq
+        self._context["half_window_size"] = half_window_size
+        self._context['padding_unit'] = units.PaddingLeftAndRight(
+            left_padding_num=self._context["half_window_size"],
+            right_padding_num=self._context["half_window_size"],
+            pad_value=padding_token_index,
+        )
+
+    def fit(self, data_pack: DataPack, verbose: int = 1):
+        """
+        Fit pre-processing context for transformation.
+
+        :param data_pack: data_pack to be preprocessed.
+        :param verbose: Verbosity.
+        :return: class:`BasicPreprocessor` instance.
+        """
+        data_pack = data_pack.apply_on_text(chain_transform(self._units),
+                                            verbose=verbose)
+        fitted_counter_unit = build_unit_from_data_pack(self._counter_unit,
+                                                        data_pack,
+                                                        flatten=False,
+                                                        mode='right',
+                                                        verbose=verbose)
+        self._context['counter_unit'] = fitted_counter_unit
+        self._context['term_idf'] = fitted_counter_unit.context["idf"]
+
+        fitted_filter_unit = build_unit_from_data_pack(self._filter_unit,
+                                                       data_pack,
+                                                       flatten=False,
+                                                       mode='right',
+                                                       verbose=verbose)
+        data_pack = data_pack.apply_on_text(fitted_filter_unit.transform,
+                                            mode='right', verbose=verbose)
+        self._context['filter_unit'] = fitted_filter_unit
+
+        vocab_unit = build_vocab_unit(data_pack, verbose=verbose)
+        self._context['vocab_unit'] = vocab_unit
+
+        vocab_size = len(vocab_unit.state['term_index'])
+        self._context['vocab_size'] = vocab_size
+        self._context['embedding_input_dim'] = vocab_size
+
+        return self
+
+    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
+        """
+        Apply transformation on data.
+
+        :param data_pack: Inputs to be preprocessed.
+        :param verbose: Verbosity.
+
+        :return: Transformed data as :class:`DataPack` object.
+        """
+        data_pack = data_pack.copy()
+        # simple preprocessing
+        data_pack.apply_on_text(chain_transform(self._units), inplace=True,
+                                verbose=verbose)
+        # filter
+        data_pack.apply_on_text(self._context['filter_unit'].transform,
+                                mode='right', inplace=True, verbose=verbose)
+        # token to id
+        data_pack.apply_on_text(self._context['vocab_unit'].transform,
+                                mode='both', inplace=True, verbose=verbose)
+        # truncate
+        if self._truncated_length_left:
+            data_pack.apply_on_text(self._left_truncatedlength_unit.transform,
+                                    mode='left', inplace=True, verbose=verbose)
+        if self._truncated_length_right:
+            data_pack.apply_on_text(self._right_truncatedlength_unit.transform,
+                                    mode='right', inplace=True,
+                                    verbose=verbose)
+        # add length
+        data_pack.append_text_length(inplace=True, verbose=verbose)
+        data_pack.drop_empty(inplace=True)
+        # padding on left and right for matching window
+        data_pack.apply_on_text(self._context['padding_unit'].transform,
+                                mode='right', inplace=True, verbose=verbose)
+        return data_pack
diff --git a/matchzoo/preprocessors/units/__init__.py b/matchzoo/preprocessors/units/__init__.py
index 950b794..9531f6b 100755
--- a/matchzoo/preprocessors/units/__init__.py
+++ b/matchzoo/preprocessors/units/__init__.py
@@ -15,6 +15,8 @@
 from .character_index import CharacterIndex
 from .word_exact_match import WordExactMatch
 from .truncated_length import TruncatedLength
+from .frequency_counter import FrequencyCounter
+from .padding_left_and_right import PaddingLeftAndRight
 
 
 def list_available() -> list:
diff --git a/matchzoo/preprocessors/units/frequency_counter.py b/matchzoo/preprocessors/units/frequency_counter.py
new file mode 100644
index 0000000..049c309
--- /dev/null
+++ b/matchzoo/preprocessors/units/frequency_counter.py
@@ -0,0 +1,63 @@
+import collections
+import typing
+
+import numpy as np
+
+from .stateful_unit import StatefulUnit
+
+
+class FrequencyCounter(StatefulUnit):
+    """
+    Frequency counter unit.
+
+    Examples::
+        >>> import matchzoo as mz
+
+    To filter based on document frequency (df):
+        >>> unit = mz.preprocessors.units.FrequencyCounter()
+        >>> unit.fit([['A', 'B'], ['B', 'C']])
+        >>> unit.context(['A', 'B', 'C'])
+        {'tf': Counter({'A': 1, 'B': 2, 'C': 1}),
+         'df': Counter({'B': 2, 'A': 1, 'C': 1}),
+         'idf': Counter({'B': 1.0, 'A': 1.4054651081081644, 'C': 1.4054651081081644})}
+        >>> unit.transform(['A', 'B', 'C'])
+        ['A', 'B', 'C']
+
+    """
+
+    def __init__(self):
+        """Frequency counter unit."""
+        super().__init__()
+
+    def fit(self, list_of_tokens: typing.List[typing.List[str]]):
+        """Fit `list_of_tokens` by calculating tf/df/idf states."""
+
+        self._context["tf"] = self._tf(list_of_tokens)
+        self._context["df"] = self._df(list_of_tokens)
+        self._context["idf"] = self._idf(list_of_tokens)
+
+    def transform(self, input_: list) -> list:
+        """Transform do nothing."""
+        return input_
+
+    @classmethod
+    def _tf(cls, list_of_tokens: list) -> dict:
+        stats = collections.Counter()
+        for tokens in list_of_tokens:
+            stats.update(tokens)
+        return stats
+
+    @classmethod
+    def _df(cls, list_of_tokens: list) -> dict:
+        stats = collections.Counter()
+        for tokens in list_of_tokens:
+            stats.update(set(tokens))
+        return stats
+
+    @classmethod
+    def _idf(cls, list_of_tokens: list) -> dict:
+        num_docs = len(list_of_tokens)
+        stats = cls._df(list_of_tokens)
+        for key, val in stats.most_common():
+            stats[key] = np.log((1 + num_docs) / (1 + val)) + 1
+        return stats
\ No newline at end of file
diff --git a/matchzoo/preprocessors/units/padding_left_and_right.py b/matchzoo/preprocessors/units/padding_left_and_right.py
new file mode 100644
index 0000000..2290f91
--- /dev/null
+++ b/matchzoo/preprocessors/units/padding_left_and_right.py
@@ -0,0 +1,56 @@
+from .stateful_unit import StatefulUnit
+
+
+class PaddingLeftAndRight(StatefulUnit):
+    """
+    Vocabulary class.
+
+    :param pad_value: The string value for the padding position.
+    :param oov_value: The string value for the out-of-vocabulary terms.
+
+    Examples:
+        >>> vocab = Vocabulary(pad_value='[PAD]', oov_value='[OOV]')
+        >>> vocab.fit(['A', 'B', 'C', 'D', 'E'])
+        >>> term_index = vocab.state['term_index']
+        >>> term_index  # doctest: +SKIP
+        {'[PAD]': 0, '[OOV]': 1, 'D': 2, 'A': 3, 'B': 4, 'C': 5, 'E': 6}
+        >>> index_term = vocab.state['index_term']
+        >>> index_term  # doctest: +SKIP
+        {0: '[PAD]', 1: '[OOV]', 2: 'D', 3: 'A', 4: 'B', 5: 'C', 6: 'E'}
+
+        >>> term_index['out-of-vocabulary-term']
+        1
+        >>> index_term[0]
+        '[PAD]'
+        >>> index_term[42]
+        Traceback (most recent call last):
+            ...
+        KeyError: 42
+        >>> a_index = term_index['A']
+        >>> c_index = term_index['C']
+        >>> vocab.transform(['C', 'A', 'C']) == [c_index, a_index, c_index]
+        True
+        >>> vocab.transform(['C', 'A', '[OOV]']) == [c_index, a_index, 1]
+        True
+        >>> indices = vocab.transform(list('ABCDDZZZ'))
+        >>> ' '.join(vocab.state['index_term'][i] for i in indices)
+        'A B C D D [OOV] [OOV] [OOV]'
+
+    """
+
+    def __init__(self, left_padding_num: int, right_padding_num: int, pad_value: str or int = 0):
+        """Vocabulary unit initializer."""
+        super().__init__()
+        self._context["left_padding_num"] = left_padding_num
+        self._context["right_padding_num"] = right_padding_num
+        self._context["pad_value"] = pad_value
+
+    def fit(self, tokens: list):
+        """ Do nothing. """
+        pass
+
+    def transform(self, input_: list) -> list:
+        """Padding on left and right."""
+        return [self._context["pad_value"]] * self._context["left_padding_num"] + \
+               input_ + \
+               [self._context["pad_value"]] * self._context["right_padding_num"]
\ No newline at end of file

From 3ff3f5b728d5f2fe33a099ec805793ee584e2301 Mon Sep 17 00:00:00 2001
From: dalek-who <839991169@qq.com>
Date: Mon, 21 Sep 2020 15:43:32 +0800
Subject: [PATCH 2/6] deeprank callback: convert document to matching windows

---
 matchzoo/dataloader/callbacks/__init__.py |   1 +
 matchzoo/dataloader/callbacks/window.py   | 124 ++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 matchzoo/dataloader/callbacks/window.py

diff --git a/matchzoo/dataloader/callbacks/__init__.py b/matchzoo/dataloader/callbacks/__init__.py
index eaeb31f..f10e9ef 100755
--- a/matchzoo/dataloader/callbacks/__init__.py
+++ b/matchzoo/dataloader/callbacks/__init__.py
@@ -4,3 +4,4 @@
 from .padding import BasicPadding
 from .padding import DRMMPadding
 from .padding import BertPadding
+from .window import Window
diff --git a/matchzoo/dataloader/callbacks/window.py b/matchzoo/dataloader/callbacks/window.py
new file mode 100644
index 0000000..a1bc2d8
--- /dev/null
+++ b/matchzoo/dataloader/callbacks/window.py
@@ -0,0 +1,124 @@
+from typing import List, Dict, Tuple
+from itertools import product, chain, zip_longest
+
+import numpy as np
+
+import matchzoo as mz
+from matchzoo.engine.base_callback import BaseCallback
+
+
+class Window(BaseCallback):
+    """
+    Generate document match window for each query term.
+    :param half_window_size: half of the matching-window size, not including the center word
+        so the full window size is 2 * half_window_size + 1
+    :param max_match: a term should have fewer than max_match matching-windows,
+        the excess will be discarded
+
+    Example:
+        >>> import matchzoo as mz
+        >>> from matchzoo.dataloader.callbacks import Ngram
+        >>> data = mz.datasets.toy.load_data()
+        >>> preprocessor = mz.preprocessors.BasicPreprocessor(ngram_size=3)
+        >>> data = preprocessor.fit_transform(data)
+        >>> callback = Ngram(preprocessor=preprocessor, mode='index')
+        >>> dataset = mz.dataloader.Dataset(
+        ...     data, callbacks=[callback])
+        >>> _ = dataset[0]
+
+    """
+
+    def __init__(
+        self,
+        half_window_size: int=5,
+        max_match: int=20,
+    ):
+        """Init."""
+        self._half_window_size = half_window_size
+        self._max_match = max_match
+
+    def on_batch_unpacked(self, x, y):
+        """
+        Extract `window_of_term_right`, `window_position_of_term_right`,
+         `window_num_of_term_right` for `x`.
+        """
+        batch_size = len(x['text_left'])
+        x['window_of_term_right'] = [... for _ in range(batch_size)]
+        x['window_position_of_term_right'] = [... for _ in range(batch_size)]
+        x['window_num_of_term_right'] = [... for _ in range(batch_size)]
+        for idx, (query, query_len, doc, doc_len) in enumerate(zip(
+                x['text_left'], x['length_left'], x['text_right'], x['length_right'])):
+            window_of_term, window_position_of_term, window_num_of_term = \
+                self._build_window(query, query_len, doc, doc_len)
+            x['window_of_term_right'][idx] = window_of_term
+            x['window_position_of_term_right'][idx] = window_position_of_term
+            x['window_num_of_term_right'][idx] = window_num_of_term
+        
+        array_window_num_of_query_right = np.array([array.shape[0] for array in x['window_of_term_right']])
+        array_window_of_term_right = _pad_sequence(x['window_of_term_right'], pad_value=-1)
+        array_window_position_of_term_right = _pad_sequence(x['window_position_of_term_right'], pad_value=-1)
+        array_window_num_of_term_right = _pad_sequence(x['window_num_of_term_right'], pad_value=-1)
+
+        x['window_num_of_query_right'] = array_window_num_of_query_right
+        x['window_of_term_right'] = array_window_of_term_right
+        x['window_position_of_term_right'] = array_window_position_of_term_right
+        x['window_num_of_term_right'] = array_window_num_of_term_right
+        pass
+
+
+    def _build_window(self, query: list, query_len: int, doc: list, doc_len: int) \
+            -> Tuple[np.ndarray, np.ndarray, list]:
+        window_of_term = [[] for _ in range(query_len)]
+        window_position_of_term = [[] for _ in range(query_len)]
+        window_num_of_term = [0 for _ in range(query_len)]
+
+        # get doc window for each query term
+        for doc_window_position, query_term_position in product(range(doc_len), range(query_len)):
+            if window_num_of_term[query_term_position] > self._max_match:
+                continue
+            padding_doc_window_position = doc_window_position + self._half_window_size
+            doc_term = doc[padding_doc_window_position]
+            query_term = query[query_term_position]
+            if query_term == doc_term:
+                window = self._get_window(doc=doc, center=padding_doc_window_position)
+                # window: list, len=full_window_size, element: int, token_id
+                window_of_term[query_term_position].append(window)
+                window_position_of_term[query_term_position].append(doc_window_position)
+                window_num_of_term[query_term_position] += 1
+
+        # window_of_term: list[list[list[int]]]:  len=query_len,
+        #   window_of_term[i]: list[list]:  len: window_num of term_i
+        #       window_of_term[i][j]: list: len: len=full_window_size,
+        #           window_of_term[i][j][k]: int, token_id
+        #
+        # window_position_of_term: list[list[int]]: len=query_len
+        #   window_position_of_term[i]: list[int]: len: window_num
+        #       window_position_of_term[i][j]: int, position index of window center
+        #
+        # window_num_of_term: list[int], len=query_len
+        #   window_num_of_term[i]: int, window_num of term_i, sum()
+
+        # flatten
+        window_of_term = list(chain.from_iterable(window_of_term))
+        window_position_of_term = list(chain.from_iterable(window_position_of_term))
+
+        # to array
+        window_of_term = np.stack(window_of_term, axis=0) if len(window_of_term)>0 \
+            else np.zeros((0, 2*self._half_window_size + 1), dtype=np.long)
+        window_position_of_term = np.array(window_position_of_term)
+        window_num_of_term = np.array(window_num_of_term)
+
+        return window_of_term, window_position_of_term, window_num_of_term
+
+    def _get_window(self, doc: list, center: int) -> list:
+        return doc[center - self._half_window_size: center + self._half_window_size + 1]
+
+def _pad_sequence(list_of_array: List[np.ndarray], pad_value):
+    batch_size = len(list_of_array)
+    max_shape = np.array([array.shape for array in list_of_array]).max(axis=0).tolist()
+    batch_array = np.ones([batch_size, *max_shape], dtype=list_of_array[0].dtype) * pad_value
+    for i in range(batch_size):
+        array = list_of_array[i]
+        array_slice = [slice(None, end, None) for end in array.shape]
+        batch_array[(i, *array_slice)] = array
+    return batch_array
\ No newline at end of file

From 01fef0489964e1b76ae53424cd6180e0903058dd Mon Sep 17 00:00:00 2001
From: dalek-who <839991169@qq.com>
Date: Mon, 21 Sep 2020 15:44:12 +0800
Subject: [PATCH 3/6] deeprank callback: convert document to matching windows

---
 matchzoo/dataloader/callbacks/window.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/matchzoo/dataloader/callbacks/window.py b/matchzoo/dataloader/callbacks/window.py
index a1bc2d8..fb46c3c 100644
--- a/matchzoo/dataloader/callbacks/window.py
+++ b/matchzoo/dataloader/callbacks/window.py
@@ -53,7 +53,7 @@ def on_batch_unpacked(self, x, y):
             x['window_of_term_right'][idx] = window_of_term
             x['window_position_of_term_right'][idx] = window_position_of_term
             x['window_num_of_term_right'][idx] = window_num_of_term
-        
+
         array_window_num_of_query_right = np.array([array.shape[0] for array in x['window_of_term_right']])
         array_window_of_term_right = _pad_sequence(x['window_of_term_right'], pad_value=-1)
         array_window_position_of_term_right = _pad_sequence(x['window_position_of_term_right'], pad_value=-1)

From ef9d2f35a03e2933ad0e8f0520aee768043016cf Mon Sep 17 00:00:00 2001
From: dalek-who <839991169@qq.com>
Date: Wed, 30 Sep 2020 17:20:42 +0800
Subject: [PATCH 4/6] DeepRank model

---
 matchzoo/models/__init__.py  |   1 +
 matchzoo/models/deep_rank.py | 410 +++++++++++++++++++++++++++++++++++
 2 files changed, 411 insertions(+)
 create mode 100644 matchzoo/models/deep_rank.py

diff --git a/matchzoo/models/__init__.py b/matchzoo/models/__init__.py
index 161ab47..3cff0ae 100644
--- a/matchzoo/models/__init__.py
+++ b/matchzoo/models/__init__.py
@@ -22,6 +22,7 @@
 from .re2 import RE2
 from .hcrn import HCRN
 from .dynamic_clip import DynamicClip
+from .deep_rank import DeepRank
 
 def list_available() -> list:
     from matchzoo.engine.base_model import BaseModel
diff --git a/matchzoo/models/deep_rank.py b/matchzoo/models/deep_rank.py
new file mode 100644
index 0000000..887b6da
--- /dev/null
+++ b/matchzoo/models/deep_rank.py
@@ -0,0 +1,410 @@
+"""An implementation of DSSM, Deep Structured Semantic Model."""
+import typing
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+import numpy as np
+
+from matchzoo import preprocessors
+from matchzoo.engine.param_table import ParamTable
+from matchzoo.engine.param import Param
+from matchzoo.engine.base_model import BaseModel
+from matchzoo.engine.base_preprocessor import BasePreprocessor
+from matchzoo.dataloader import callbacks
+from matchzoo.engine.base_callback import BaseCallback
+
+
+class DeepRank(BaseModel):
+    """
+    Deep structured semantic model.
+
+    Examples:
+        >>> model = DeepRank()
+        >>> embedding_matrix = np.ones((3000, 50), dtype=float)
+        >>> term_weight_embedding_matrix = np.ones((3000, 1), dtype=float)
+        >>> model.params['embedding'] = embedding_matrix
+        >>> model.params['embedding_input_dim'] = embedding_matrix.shape[0]
+        >>> model.params['embedding_output_dim'] = embedding_matrix.shape[1]
+        >>> model.params['term_weight_embedding'] = term_weight_embedding_matrix
+        >>> model.params['embedding_freeze'] = False
+        >>> model.guess_and_fill_missing_params(verbose=0)
+        >>> model.build()
+
+    """
+
+    @classmethod
+    def get_default_params(cls) -> ParamTable:
+        """:return: model default parameters."""
+        params = super().get_default_params(
+            with_multi_layer_perceptron=False, with_embedding=True)
+        params.add(Param(name='term_weight_embedding',
+                         desc="Query term weight embedding matrix"))
+        params.add(Param(name='reduce_out_dim', value=1,
+                         desc="Output dimension of word embedding reduction"))
+        params.add(Param(name='reduce_conv_kernel_size', value=1,
+                         desc="Kernel size of convolution word embedding reduction"))
+        params.add(Param(name='reduce_conv_stride', value=1,
+                         desc="Stride of convolution word embedding reduction"))
+        params.add(Param(name='reduce_conv_padding', value=0,
+                         desc="Zero-padding added to both side of convolution \
+                         word embedding reduction"))
+        params.add(Param(name='half_window_size', value=5,
+                         desc="Half of matching-window size, not including center term"))
+        params.add(Param(name='encode_out_dim', value=4,
+                         desc="Output dimension of encode"))
+        params.add(Param(name='encode_conv_kernel_size', value=3,
+                         desc="Kernel size of convolution encode"))
+        params.add(Param(name='encode_conv_stride', value=1,
+                         desc="Stride of convolution encode"))
+        params.add(Param(name='encode_conv_padding', value=1,
+                         desc="Zero-padding added to both side of convolution encode"))
+        params.add(Param(name='encode_pool_out', value=1,
+                         desc="Pooling size of global max-pooling for encoding matrix"))
+        params.add(Param(name='encode_leaky', value=0.2,
+                         desc="Relu leaky of encoder"))
+        params.add(Param(name='gru_hidden_dim', value=3,
+                         desc="Aggregation Network gru hidden dimension"))
+
+        return params
+
+    @classmethod
+    def get_default_preprocessor(
+        cls,
+        truncated_mode: str = 'pre',
+        truncated_length_left: int = None,
+        truncated_length_right: int = None,
+        filter_low_freq: float = 5,
+        half_window_size: int = 5,
+        padding_token_index: int = 0
+    ) -> BasePreprocessor:
+        """
+        Model default preprocessor.
+
+        The preprocessor's transform should produce a correctly shaped data
+        pack that can be used for training.
+
+        :return: Default preprocessor.
+        """
+        return preprocessors.DeepRankPreprocessor(
+            truncated_mode=truncated_mode,
+            truncated_length_left=truncated_length_left,
+            truncated_length_right=truncated_length_right,
+            filter_low_freq=filter_low_freq,
+            half_window_size=half_window_size,
+            padding_token_index=padding_token_index,
+        )
+
+    @classmethod
+    def get_default_padding_callback(
+            cls,
+            pad_word_value: typing.Union[int, str] = 0,
+            pad_word_mode: str = 'post',
+    ) -> BaseCallback:
+        """
+        Only padding query.
+
+        :return: Default padding callback.
+        """
+        return callbacks.BasicPadding(
+            pad_word_value=pad_word_value,
+            pad_word_mode=pad_word_mode
+        )
+
+    def build(self):
+        """Build model structure."""
+        # self.embedding = self._make_embedding_layer(
+        #     freeze=self._params["embedding_freeze"],
+        #     embedding=self._params["embedding"]
+        # )
+        self.embedding = self._make_default_embedding_layer()
+
+
+        if self._params["term_weight_embedding"] is not None:
+            term_weight_embedding = self._params["term_weight_embedding"]
+        else:
+            # self._params['embedding_input_dim'] = (
+            #     self._params['embedding'].shape[0]
+            # )
+            # self._params['embedding_output_dim'] = 1
+            term_weight_embedding = np.ones(
+                (self._params["embedding_input_dim"], 1), dtype=float)
+        self.term_weight_embedding = self._make_embedding_layer(
+            freeze=self._params["embedding_freeze"],
+            embedding=term_weight_embedding
+        )
+
+        self.query_reduce = nn.Conv1d(
+            in_channels=self._params["embedding_output_dim"],
+            out_channels=self._params["reduce_out_dim"],
+            kernel_size=self._params["reduce_conv_kernel_size"],
+            stride=self._params["reduce_conv_stride"],
+            padding=self._params["reduce_conv_padding"]
+        )
+
+        self.doc_reduce = nn.Conv1d(
+            in_channels=self._params["embedding_output_dim"],
+            out_channels=self._params["reduce_out_dim"],
+            kernel_size=self._params["reduce_conv_kernel_size"],
+            stride=self._params["reduce_conv_stride"],
+            padding=self._params["reduce_conv_padding"]
+        )
+
+        interact_tensor_channel = 2 * self._params["reduce_out_dim"] + 1
+        self.encoding = nn.Sequential(
+            nn.Conv2d(
+                in_channels=interact_tensor_channel,
+                out_channels=self._params["encode_out_dim"],
+                kernel_size=self._params["encode_conv_kernel_size"],
+                stride=self._params["encode_conv_stride"],
+                padding=self._params["encode_conv_padding"],
+            ),
+            nn.AdaptiveAvgPool2d(output_size=self._params["encode_pool_out"]),
+            nn.LeakyReLU(self._params["encode_leaky"])
+        )
+
+        gru_in_dim = \
+            self._params["encode_out_dim"] * self._params["encode_pool_out"]**2 + 1
+        self.gru = nn.GRU(
+            input_size=gru_in_dim,
+            hidden_size=self._params["gru_hidden_dim"],
+            bidirectional=True
+        )
+
+        self.pool = nn.AdaptiveAvgPool1d(1)
+
+        self.out = self._make_output_layer(
+            in_features=2 * self._params["gru_hidden_dim"]
+        )
+
+    def forward(self, inputs):
+        """Forward."""
+        # Process left & right input.
+        all_query: torch.LongTensor = inputs["text_left"]
+        all_query_term_window_num: torch.LongTensor = inputs['term_window_num']
+        all_query_len: torch.LongTensor = inputs["length_left"]
+
+        all_window: torch.LongTensor = inputs['window_right']
+        all_window_position: torch.LongTensor = inputs['window_position_right']
+        all_query_window_num: torch.LongTensor = inputs['query_window_num']
+
+        # all_query: [batch_size, max_q_seq_len]
+        # all_query_term_window_num: [batch_size, max_q_seq_len]
+        # all_query_len: [batch_size]
+
+        # all_window: [batch_size, max_window_num, full_window_size]
+        # all_window_position: [batch_size, max_window_num]
+        # all_query_window_num: [batch_size]
+
+        batch_size: int = all_query.shape[0]
+        full_window_size: int = all_window.shape[2]
+        device: torch.device = all_query.device
+
+        ##############################
+        # query embedding and reduce
+        ##############################
+
+        # all_query embedding
+        # all_query: [batch_size, max_q_seq_len]
+        #   -embedding-> [batch_size, max_q_seq_len, embed_dim]
+        #   -permute(0, 2, 1)-> [batch_size, embed_dim, max_q_seq_len]
+        all_query_embed = self.embedding(all_query).permute(0, 2, 1)
+
+        # all_query reduce
+        # all_query_embed:  [batch_size, embed_dim, max_q_seq_len]
+        #   -all_query_reduce-> [batch_size, reduce_out_dim, max_q_seq_len]
+        all_query_reduce = self.query_reduce(all_query_embed)
+
+        ##############################
+        # query term weight
+        ##############################
+
+        # all_query: [batch_size, max_q_seq_len]
+        #   -term_weight_embedding-> [batch_size, max_q_seq_len, 1]
+        #   -squeeze-> [batch_size, max_q_seq_len]
+        all_query_term_weight = self.term_weight_embedding(all_query).squeeze(2)
+
+        out = []
+        for i in range(batch_size):
+            one_query_seq_len = all_query_len[i].item()
+            one_query_window_num = all_query_window_num[i].item()
+            if one_query_window_num == 0:
+                out.append(torch.zeros(self._params["gru_hidden_dim"] * 2, device=device))
+                continue
+
+            ##############################
+            # window embedding and reduce
+            ##############################
+
+            # all_window: [batch_size, max_window_num, full_window_size]
+            #   -[i]-> [one_query_window_num, full_window_size]
+            one_query_windows = all_window[i][:one_query_window_num]
+
+            # one_query_windows: [one_query_window_num, full_window_size]
+            #   -embedding-> [one_query_window_num, full_window_size, embed_dim]
+            #   -permute(0, 2, 1)-> [one_query_window_num, embed_dim, full_window_size]
+            one_query_windows_embed = self.embedding(one_query_windows).permute(0, 2, 1)
+
+            # one_query_windows_embed: [one_query_window_num, embed_dim, full_window_size]
+            #   -doc_reduce-> [one_query_window_num, reduce_out_dim, full_window_size]
+            one_query_windows_reduce = self.doc_reduce(one_query_windows_embed)
+
+            ##############################
+            # interaction signal
+            ##############################
+
+            # all_query_embed: [batch_size, embed_dim, max_q_seq_len]
+            #   -[]-> [embed_dim, one_query_seq_len]
+            one_query_embed = all_query_embed[i, :, :one_query_seq_len]
+
+            # one_query_embed: [embed_dim, one_query_seq_len]
+            # one_query_windows_embed: [one_query_window_num, embed_dim, full_window_size]
+            #   -einsum("el,new->nlw")->
+            #       [one_query_window_num, one_query_seq_len, full_window_size]
+            #   -[]-> [one_query_window_num, 1, one_query_seq_len, full_window_size]
+            interacition_signal = torch.einsum(
+                "el,new->nlw", one_query_embed, one_query_windows_embed)[:, None, :, :]
+
+            ##############################
+            # encoding
+            ##############################
+
+            # all_query_reduce: [batch_size, reduce_out_dim, max_q_seq_len]
+            #   -[]-> [reduce_out_dim, one_query_seq_len]
+            one_query_reduce = all_query_reduce[i, :, :one_query_seq_len]
+
+            # one_query_reduce: [reduce_dim, one_query_seq_len]
+            #   -[]-> [1, reduce_dim, one_query_seq_len, 1]
+            #   -expand->
+            #     [one_query_window_num, reduce_dim, one_query_seq_len, full_window_size]
+            one_query_reduce_expand = one_query_reduce[None, :, :, None] \
+                .expand(one_query_window_num, -1, -1, full_window_size)
+
+            # one_query_windows_reduce:
+            #   [one_query_window_num, reduce_dim, full_window_size]
+            #   -[]-> [one_query_window_num, reduce_dim, 1, full_window_size]
+            #   -expand->
+            #     [one_query_window_num, reduce_dim, one_query_seq_len, full_window_size]
+            one_query_windows_reduce_expand = one_query_windows_reduce[:, :, None, :] \
+                .expand(-1, -1, one_query_seq_len, -1)
+
+            # one_query_reduce_expand:
+            #   [one_query_window_num, reduce_dim, one_query_seq_len, full_window_size]
+            # one_query_windows_reduce_expand:
+            #   [one_query_window_num, reduce_dim, one_query_seq_len, full_window_size]
+            # interacition_signal:
+            #   [one_query_window_num, 1, one_query_seq_len, full_window_size]
+            #   -stack->
+            #     [one_query_window_num, 2*reduce_dim + 1,
+            #       one_query_seq_len, full_window_size]
+            encoder_input_tensor = torch.cat(
+                [one_query_reduce_expand,
+                 one_query_windows_reduce_expand,
+                 interacition_signal],
+                dim=1)
+
+            # encoder_input_tensor:
+            #       [one_query_window_num, 2*reduce_dim + 1,
+            #        one_query_seq_len, full_window_size]
+            #   -encoding->
+            #     -Conv2d->
+            #       [one_query_window_num, encode_out_dim,
+            #        one_query_seq_len, full_window_size]
+            #     -AdaptiveAvgPool2d+ReLU->
+            #       [one_query_window_num, encode_out_dim,
+            #        encode_pool_out, encode_pool_out]
+            #   -flatten->
+            #       [one_query_window_num, encode_out_dim * encode_pool_out^2]
+            encoder_output_tensor = self.encoding(encoder_input_tensor).flatten(1)
+
+            ##############################
+            # position encoding and gru
+            ##############################
+
+            # all_window_position: [batch_size, max_window_num]
+            #   -[]-> [one_query_window_num, 1]
+            one_query_window_position = \
+                all_window_position[i, :one_query_window_num, None]
+
+            # one_query_window_position: [one_query_window_num, 1]
+            #   --> [one_query_window_num, 1]
+            window_position_encoding = (1. / (one_query_window_position + 1.)).float()
+
+            # encoder_output_tensor:
+            #   [one_query_window_num, encode_out_dim * encode_pool_out^2]
+            # window_position_encoding:
+            #   [one_query_window_num, 1]  gru_in_dim
+            #   -cat->
+            #       [one_query_window_num, gru_in_dim],
+            #        gru_in_dim = encode_out_dim * encode_pool_out^2 + 1
+            enc_and_pos = \
+                torch.cat([encoder_output_tensor, window_position_encoding], dim=1)
+
+            # all_query_term_window_num: [batch_size, max_q_seq_len]
+            #   -[]-> [one_query_seq_len]
+            #   -tolist-> list, len=one_query_seq_len
+            one_query_term_window_num = \
+                all_query_term_window_num[i, :one_query_seq_len].tolist()
+
+            # enc_and_pos: [one_query_window_num, gru_in_dim]
+            #   -split->
+            #       tuple of tensor, len=one_query_seq_len,
+            #           element i: tensor, [one_query_term_window_num[i], gru_in_dim]
+            #   -pad_sequence->
+            #       [max(one_query_term_window_num[i]), one_query_seq_len, gru_in_dim]
+            one_query_pad_split_windows = \
+                pad_sequence(enc_and_pos.split(one_query_term_window_num, dim=0))
+
+            # one_query_pad_split_windows:
+            #   [max(one_query_term_window_num[i]), one_query_seq_len, gru_in_dim]
+            #   -gru->
+            #     gru_out: [max(one_query_term_window_num[i]),
+            #               one_query_seq_len, gru_hidden_dim * 2]
+            #     gru_hidden: [num_layers, one_query_seq_len, gru_hidden_dim * 2]
+            gru_out, gru_hidden = self.gru(one_query_pad_split_windows)
+            # type: torch.Tensor, torch.Tensor
+
+            ##############################
+            # aggregate
+            ##############################
+
+            # gru_out:
+            #       [max(one_query_term_window_num[i]),
+            #        one_query_seq_len, gru_hidden_dim * 2]
+            #   -permute(1,2,0)->
+            #       [one_query_seq_len, gru_hidden_dim * 2,
+            #        max(one_query_term_window_num[i])]
+            gru_out = gru_out.permute(1, 2, 0)
+
+            # gru_out:
+            #       [one_query_seq_len, gru_hidden_dim * 2,
+            #        max(one_query_term_window_num[i])]
+            #   -pool-> [one_query_seq_len, gru_hidden_dim * 2, 1]
+            #   -squeeze-> [one_query_seq_len, gru_hidden_dim * 2]
+            pool_gru_out = self.pool(gru_out).squeeze(2)
+
+            # all_query_term_weight: [batch_size, max_q_seq_len]
+            #   -[]-> [one_query_seq_len]
+            one_query_term_weight = all_query_term_weight[i, :one_query_seq_len]
+
+            # pool_gru_out: [one_query_seq_len, gru_hidden_dim * 2]
+            # one_query_term_weight: [one_query_seq_len]
+            #   -einsum-> [gru_hidden_dim * 2]
+            final_embed = torch.einsum("lh,l->h", pool_gru_out, one_query_term_weight)
+
+            out.append(final_embed)
+            pass
+
+        ##############################
+        # output score
+        ##############################
+
+        # out: list, len=batch_size, element i: tensor, [gru_hidden_dim * 2]
+        #   -stack-> [batch_size, gru_hidden_dim * 2]
+        out = torch.stack(out, dim=0)
+
+        # out: [batch_size, gru_hidden_dim * 2]
+        #   -out-> [batch_size, out_dim]
+        out = self.out(out)
+        return out

From 503947e9e7442c502bcfeb0621d26ec10ebcf533 Mon Sep 17 00:00:00 2001
From: dalek-who <839991169@qq.com>
Date: Wed, 30 Sep 2020 17:21:45 +0800
Subject: [PATCH 5/6] window unit, flake8 pass

---
 matchzoo/dataloader/callbacks/window.py | 85 +++++++++++++------------
 1 file changed, 44 insertions(+), 41 deletions(-)

diff --git a/matchzoo/dataloader/callbacks/window.py b/matchzoo/dataloader/callbacks/window.py
index fb46c3c..db984f1 100644
--- a/matchzoo/dataloader/callbacks/window.py
+++ b/matchzoo/dataloader/callbacks/window.py
@@ -10,8 +10,9 @@
 class Window(BaseCallback):
     """
     Generate document match window for each query term.
-    :param half_window_size: half of the matching-window size, not including the center word
-        so the full window size is 2 * half_window_size + 1
+
+    :param half_window_size: half of the matching-window size, not including the
+        center word, so the full window size is 2 * half_window_size + 1
     :param max_match: a term should have fewer than max_match matching-windows,
         the excess will be discarded
 
@@ -30,61 +31,59 @@ class Window(BaseCallback):
 
     def __init__(
         self,
-        half_window_size: int=5,
-        max_match: int=20,
+        half_window_size: int = 5,
+        max_match: int = 20,
     ):
         """Init."""
         self._half_window_size = half_window_size
         self._max_match = max_match
 
     def on_batch_unpacked(self, x, y):
-        """
-        Extract `window_of_term_right`, `window_position_of_term_right`,
-         `window_num_of_term_right` for `x`.
-        """
+        """Extract `window_right`, `window_position_right`, `term_window_num` for `x`."""
         batch_size = len(x['text_left'])
-        x['window_of_term_right'] = [... for _ in range(batch_size)]
-        x['window_position_of_term_right'] = [... for _ in range(batch_size)]
-        x['window_num_of_term_right'] = [... for _ in range(batch_size)]
+        x['window_right'] = [... for _ in range(batch_size)]
+        x['window_position_right'] = [... for _ in range(batch_size)]
+        x['term_window_num'] = [... for _ in range(batch_size)]
         for idx, (query, query_len, doc, doc_len) in enumerate(zip(
                 x['text_left'], x['length_left'], x['text_right'], x['length_right'])):
-            window_of_term, window_position_of_term, window_num_of_term = \
+            window_right, window_position_right, term_window_num = \
                 self._build_window(query, query_len, doc, doc_len)
-            x['window_of_term_right'][idx] = window_of_term
-            x['window_position_of_term_right'][idx] = window_position_of_term
-            x['window_num_of_term_right'][idx] = window_num_of_term
-
-        array_window_num_of_query_right = np.array([array.shape[0] for array in x['window_of_term_right']])
-        array_window_of_term_right = _pad_sequence(x['window_of_term_right'], pad_value=-1)
-        array_window_position_of_term_right = _pad_sequence(x['window_position_of_term_right'], pad_value=-1)
-        array_window_num_of_term_right = _pad_sequence(x['window_num_of_term_right'], pad_value=-1)
+            x['window_right'][idx] = window_right
+            x['window_position_right'][idx] = window_position_right
+            x['term_window_num'][idx] = term_window_num
 
-        x['window_num_of_query_right'] = array_window_num_of_query_right
-        x['window_of_term_right'] = array_window_of_term_right
-        x['window_position_of_term_right'] = array_window_position_of_term_right
-        x['window_num_of_term_right'] = array_window_num_of_term_right
-        pass
+        array_query_window_num = np.array([array.shape[0] for array in x['window_right']])
+        array_window_right = _pad_sequence(x['window_right'], pad_value=-1)
+        array_window_position_right = \
+            _pad_sequence(x['window_position_right'], pad_value=-1)
+        array_term_window_num = _pad_sequence(x['term_window_num'], pad_value=-1)
 
+        x['query_window_num'] = array_query_window_num
+        x['window_right'] = array_window_right
+        x['window_position_right'] = array_window_position_right
+        x['term_window_num'] = array_term_window_num
 
     def _build_window(self, query: list, query_len: int, doc: list, doc_len: int) \
-            -> Tuple[np.ndarray, np.ndarray, list]:
+            -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
         window_of_term = [[] for _ in range(query_len)]
         window_position_of_term = [[] for _ in range(query_len)]
         window_num_of_term = [0 for _ in range(query_len)]
 
         # get doc window for each query term
-        for doc_window_position, query_term_position in product(range(doc_len), range(query_len)):
-            if window_num_of_term[query_term_position] > self._max_match:
-                continue
+        for doc_window_position in range(doc_len):
             padding_doc_window_position = doc_window_position + self._half_window_size
             doc_term = doc[padding_doc_window_position]
-            query_term = query[query_term_position]
-            if query_term == doc_term:
-                window = self._get_window(doc=doc, center=padding_doc_window_position)
-                # window: list, len=full_window_size, element: int, token_id
-                window_of_term[query_term_position].append(window)
-                window_position_of_term[query_term_position].append(doc_window_position)
-                window_num_of_term[query_term_position] += 1
+            for query_term_position in range(query_len):
+                if window_num_of_term[query_term_position] > self._max_match:
+                    continue
+                query_term = query[query_term_position]
+                if query_term == doc_term:
+                    window = self._get_window(doc=doc, center=padding_doc_window_position)
+                    # window: list, len=full_window_size, element: int, token_id
+                    window_of_term[query_term_position].append(window)
+                    window_position_of_term[query_term_position].append(
+                        doc_window_position)
+                    window_num_of_term[query_term_position] += 1
 
         # window_of_term: list[list[list[int]]]:  len=query_len,
         #   window_of_term[i]: list[list]:  len: window_num of term_i
@@ -103,8 +102,8 @@ def _build_window(self, query: list, query_len: int, doc: list, doc_len: int) \
         window_position_of_term = list(chain.from_iterable(window_position_of_term))
 
         # to array
-        window_of_term = np.stack(window_of_term, axis=0) if len(window_of_term)>0 \
-            else np.zeros((0, 2*self._half_window_size + 1), dtype=np.long)
+        window_of_term = np.stack(window_of_term, axis=0) if len(window_of_term) > 0 \
+            else np.zeros((0, 2 * self._half_window_size + 1), dtype=np.long)
         window_position_of_term = np.array(window_position_of_term)
         window_num_of_term = np.array(window_num_of_term)
 
@@ -113,12 +112,16 @@ def _build_window(self, query: list, query_len: int, doc: list, doc_len: int) \
     def _get_window(self, doc: list, center: int) -> list:
         return doc[center - self._half_window_size: center + self._half_window_size + 1]
 
+
 def _pad_sequence(list_of_array: List[np.ndarray], pad_value):
+    """Padding list of array to an array, like pytorch pad_sequence."""
     batch_size = len(list_of_array)
-    max_shape = np.array([array.shape for array in list_of_array]).max(axis=0).tolist()
-    batch_array = np.ones([batch_size, *max_shape], dtype=list_of_array[0].dtype) * pad_value
+    max_shape = \
+        np.array([array.shape for array in list_of_array]).max(axis=0).tolist()
+    batch_array = \
+        np.ones([batch_size, *max_shape], dtype=list_of_array[0].dtype) * pad_value
     for i in range(batch_size):
         array = list_of_array[i]
         array_slice = [slice(None, end, None) for end in array.shape]
         batch_array[(i, *array_slice)] = array
-    return batch_array
\ No newline at end of file
+    return batch_array

From 3eaaefb7850db2161dd862f359a5ca95bb339887 Mon Sep 17 00:00:00 2001
From: dalek-who <839991169@qq.com>
Date: Wed, 30 Sep 2020 17:22:23 +0800
Subject: [PATCH 6/6] flake8 pass

---
 .../preprocessors/deeprank_preprocessor.py    | 15 ++++---
 .../preprocessors/units/frequency_counter.py  | 13 +++---
 .../units/padding_left_and_right.py           | 45 ++++++-------------
 3 files changed, 30 insertions(+), 43 deletions(-)

diff --git a/matchzoo/preprocessors/deeprank_preprocessor.py b/matchzoo/preprocessors/deeprank_preprocessor.py
index 317b3c7..ff650cd 100644
--- a/matchzoo/preprocessors/deeprank_preprocessor.py
+++ b/matchzoo/preprocessors/deeprank_preprocessor.py
@@ -16,16 +16,19 @@
 class DeepRankPreprocessor(BasePreprocessor):
     """
     DeepRank model preprocessor helper.
-        For pre-processing, all the words in documents and queries are white-space tokenized,
-        lower-cased, and stemmed using the Krovetz stemmer.
-        Stopword removal is performed on query and document words using the INQUERY stop list.
-        Words occurred less than 5 times in the collection are removed from all the document.
+
+        For pre-processing, all the words in documents and queries are white-space
+        tokenized, lower-cased, and stemmed using the Krovetz stemmer.
+        Stopword removal is performed on query and document words using the
+        INQUERY stop list.
+        Words occurred less than 5 times in the collection are removed from all
+        the document.
         Querys are truncated below a max_length.
 
     :param filter_low_freq: Float, lower bound value used by
         :class:`FrequenceFilterUnit`.
-    :param half_window_size: int, half of the match window size (not including the center word),
-        so the real window size is 2 * half_window_size + 1
+    :param half_window_size: int, half of the match window size (not including
+        the center word), so the real window size is 2 * half_window_size + 1
     :padding_token_index: int: vocabulary index of pad token, default 0
 
     Example:
diff --git a/matchzoo/preprocessors/units/frequency_counter.py b/matchzoo/preprocessors/units/frequency_counter.py
index 049c309..d51e39d 100644
--- a/matchzoo/preprocessors/units/frequency_counter.py
+++ b/matchzoo/preprocessors/units/frequency_counter.py
@@ -11,17 +11,16 @@ class FrequencyCounter(StatefulUnit):
     Frequency counter unit.
 
     Examples::
+        >>> from collections import Counter
         >>> import matchzoo as mz
 
     To filter based on document frequency (df):
         >>> unit = mz.preprocessors.units.FrequencyCounter()
         >>> unit.fit([['A', 'B'], ['B', 'C']])
-        >>> unit.context(['A', 'B', 'C'])
-        {'tf': Counter({'A': 1, 'B': 2, 'C': 1}),
+        >>> unit.context
+        {'tf': Counter({'B': 2, 'A': 1, 'C': 1}),
          'df': Counter({'B': 2, 'A': 1, 'C': 1}),
-         'idf': Counter({'B': 1.0, 'A': 1.4054651081081644, 'C': 1.4054651081081644})}
-        >>> unit.transform(['A', 'B', 'C'])
-        ['A', 'B', 'C']
+         'idf': Counter({'A': 1.4054651081081644, 'C': 1.4054651081081644, 'B': 1.0})}
 
     """
 
@@ -60,4 +59,6 @@ def _idf(cls, list_of_tokens: list) -> dict:
         stats = cls._df(list_of_tokens)
         for key, val in stats.most_common():
             stats[key] = np.log((1 + num_docs) / (1 + val)) + 1
-        return stats
\ No newline at end of file
+        return stats
+
+
diff --git a/matchzoo/preprocessors/units/padding_left_and_right.py b/matchzoo/preprocessors/units/padding_left_and_right.py
index 2290f91..10894dc 100644
--- a/matchzoo/preprocessors/units/padding_left_and_right.py
+++ b/matchzoo/preprocessors/units/padding_left_and_right.py
@@ -9,36 +9,18 @@ class PaddingLeftAndRight(StatefulUnit):
     :param oov_value: The string value for the out-of-vocabulary terms.
 
     Examples:
-        >>> vocab = Vocabulary(pad_value='[PAD]', oov_value='[OOV]')
-        >>> vocab.fit(['A', 'B', 'C', 'D', 'E'])
-        >>> term_index = vocab.state['term_index']
-        >>> term_index  # doctest: +SKIP
-        {'[PAD]': 0, '[OOV]': 1, 'D': 2, 'A': 3, 'B': 4, 'C': 5, 'E': 6}
-        >>> index_term = vocab.state['index_term']
-        >>> index_term  # doctest: +SKIP
-        {0: '[PAD]', 1: '[OOV]', 2: 'D', 3: 'A', 4: 'B', 5: 'C', 6: 'E'}
-
-        >>> term_index['out-of-vocabulary-term']
-        1
-        >>> index_term[0]
-        '[PAD]'
-        >>> index_term[42]
-        Traceback (most recent call last):
-            ...
-        KeyError: 42
-        >>> a_index = term_index['A']
-        >>> c_index = term_index['C']
-        >>> vocab.transform(['C', 'A', 'C']) == [c_index, a_index, c_index]
-        True
-        >>> vocab.transform(['C', 'A', '[OOV]']) == [c_index, a_index, 1]
-        True
-        >>> indices = vocab.transform(list('ABCDDZZZ'))
-        >>> ' '.join(vocab.state['index_term'][i] for i in indices)
-        'A B C D D [OOV] [OOV] [OOV]'
+        >>> unit = PaddingLeftAndRight(
+        ...     left_padding_num=5, right_padding_num=3, pad_value=0)
+        >>> unit.transform([3, 1, 4, 1, 5])
+        [0, 0, 0, 0, 0, 3, 1, 4, 1, 5, 0, 0, 0]
 
     """
 
-    def __init__(self, left_padding_num: int, right_padding_num: int, pad_value: str or int = 0):
+    def __init__(
+            self,
+            left_padding_num: int,
+            right_padding_num: int,
+            pad_value: str or int = 0):
         """Vocabulary unit initializer."""
         super().__init__()
         self._context["left_padding_num"] = left_padding_num
@@ -46,11 +28,12 @@ def __init__(self, left_padding_num: int, right_padding_num: int, pad_value: str
         self._context["pad_value"] = pad_value
 
     def fit(self, tokens: list):
-        """ Do nothing. """
+        """Do nothing."""
         pass
 
     def transform(self, input_: list) -> list:
         """Padding on left and right."""
-        return [self._context["pad_value"]] * self._context["left_padding_num"] + \
-               input_ + \
-               [self._context["pad_value"]] * self._context["right_padding_num"]
\ No newline at end of file
+        return \
+            [self._context["pad_value"]] * self._context["left_padding_num"] \
+            + input_ \
+            + [self._context["pad_value"]] * self._context["right_padding_num"]