Flake8 Fixes (Part1) #1550

Merged: 22 commits, Oct 19, 2021
Changes from 21 commits
2 changes: 0 additions & 2 deletions .github/workflows/nightly.yml
@@ -73,8 +73,6 @@ jobs:
          pip install tox

      - name: Run flake8
-        # TODO: re-enable this flake8 block (turned off to get a draft of the pipeline infrastructure)
-        continue-on-error: true
        run: |
          tox -e flake8

2 changes: 0 additions & 2 deletions .github/workflows/pr-gate.yml
@@ -62,8 +62,6 @@ jobs:
          pip install tox

      - name: Run flake8
-        # TODO: re-enable this flake8 block (turned off to get a draft of the pipeline infrastructure)
-        continue-on-error: true
        run: |
          tox -e flake8

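With 'continue-on-error: true' removed from both workflows, a failing flake8 run now fails the nightly and PR-gate jobs instead of being silently tolerated. A minimal sketch of reproducing the same gate locally, assuming tox and the flake8 environment these workflows invoke are available (the tox configuration itself is not part of this diff):

# run_lint.py - hypothetical local helper, not part of this PR.
# Invokes the same tox environment as the CI step and forwards its exit code,
# so lint errors fail the command just as they now fail the workflows.
import subprocess
import sys

def run_flake8():
    result = subprocess.run(["tox", "-e", "flake8"], check=False)
    return result.returncode

if __name__ == "__main__":
    sys.exit(run_flake8())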
6 changes: 3 additions & 3 deletions contrib/sarplus/python/pysarplus/SARPlus.py
@@ -93,12 +93,12 @@ def fit(self, df):
        query = self.f(
            """
        SELECT
            {col_user}, {col_item},
            SUM({col_rating} * EXP(-log(2) * (latest_timestamp - CAST({col_timestamp} AS long)) / ({time_decay_coefficient} * 3600 * 24))) as {col_rating}
        FROM {prefix}df_train_input,
            (SELECT CAST(MAX({col_timestamp}) AS long) latest_timestamp FROM {prefix}df_train_input)
        GROUP BY {col_user}, {col_item}
        CLUSTER BY {col_user}
        """
        )

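The three changed lines in this hunk appear to differ only in trailing whitespace (flake8 W291); the query itself is unchanged. For readers of the query, the EXP(...) term implements SAR's exponential time decay: each rating is weighted so that it halves every time_decay_coefficient days, measured from the latest event in the training data (the * 3600 * 24 converts days to seconds). A small illustrative sketch of the same weighting in plain Python; the real computation runs in Spark SQL as shown, and the 30-day default below is only an example value:

import math

def decayed_rating(rating, event_ts, latest_ts, time_decay_days=30):
    # Weight halves every `time_decay_days` days, matching
    # EXP(-log(2) * (latest_timestamp - timestamp) / (time_decay_coefficient * 3600 * 24))
    age_seconds = latest_ts - event_ts
    half_life_seconds = time_decay_days * 3600 * 24
    return rating * math.exp(-math.log(2) * age_seconds / half_life_seconds)

# A 5.0 rating given exactly one half-life (30 days) before the latest event
# contributes 2.5 after decay.
print(decayed_rating(5.0, event_ts=0, latest_ts=30 * 3600 * 24))  # 2.5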
4 changes: 1 addition & 3 deletions recommenders/datasets/amazon_reviews.py
@@ -2,17 +2,15 @@
# Licensed under the MIT License.

import os
import re
import shutil
import warnings
import pandas as pd
import gzip
import random
import logging
import _pickle as cPickle

from recommenders.utils.constants import SEED
-from recommenders.datasets.download_utils import maybe_download, download_path
+from recommenders.datasets.download_utils import maybe_download


random.seed(SEED)
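The import changes here are flake8 F401 fixes ("imported but unused"): download_path is dropped from the download_utils import, along with two other imports this module no longer uses. A contrived illustration of the rule, not taken from this repository:

# example.py - contrived module to show what F401 reports
import os
import shutil  # flake8: F401 'shutil' imported but unused (nothing below uses it)

def cache_dir(name):
    # os is referenced, so its import is fine
    return os.path.join("/tmp", name)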
16 changes: 8 additions & 8 deletions recommenders/datasets/cosmos_cli.py
@@ -5,12 +5,12 @@

def find_collection(client, dbid, id):
    """Find whether or not a CosmosDB collection exists.

    Args:
        client (object): A pydocumentdb client object.
        dbid (str): Database ID.
        id (str): Collection ID.

    Returns:
        bool: True if the collection exists, False otherwise.
    """
@@ -32,12 +32,12 @@ def find_collection(client, dbid, id):

def read_collection(client, dbid, id):
    """Read a CosmosDB collection.

    Args:
        client (object): A pydocumentdb client object.
        dbid (str): Database ID.
        id (str): Collection ID.

    Returns:
        object: A collection.
    """
@@ -55,11 +55,11 @@ def read_collection(client, dbid, id):

def read_database(client, id):
    """Read a CosmosDB database.

    Args:
        client (object): A pydocumentdb client object.
        id (str): Database ID.

    Returns:
        object: A database.
    """
@@ -76,11 +76,11 @@ def read_database(client, id):

def find_database(client, id):
    """Find whether or not a CosmosDB database exists.

    Args:
        client (object): A pydocumentdb client object.
        id (str): Database ID.

    Returns:
        bool: True if the database exists, False otherwise.
    """
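The cosmos_cli.py changes appear to be whitespace-only: the blank lines inside these docstrings carried trailing spaces, which flake8 reports as W293 (whitespace on blank line); W291 is the analogous rule for trailing whitespace on non-blank lines. A quick, hypothetical helper for stripping both from a file, not part of this PR (the PR simply edits the affected lines in place):

import sys

def strip_trailing_whitespace(path):
    # Removes W291/W293 violations by rstrip-ping every line and rewriting the file.
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line.rstrip() + "\n" for line in lines)

if __name__ == "__main__":
    for path in sys.argv[1:]:
        strip_trailing_whitespace(path)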
33 changes: 16 additions & 17 deletions recommenders/datasets/covid_utils.py
@@ -1,7 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

-import json
import numpy as np
import pandas as pd
import requests
@@ -22,7 +21,7 @@ def load_pandas_df(
        azure_storage_sas_token (str): Azure storage SAS token.
        container_name (str): Azure storage container name.
        metadata_filename (str): Name of file containing top-level metadata for the dataset.

    Returns:
        metadata (pandas.DataFrame): Metadata dataframe.
    """
@@ -39,14 +38,14 @@

def remove_duplicates(df, cols):
    """ Remove duplicated entries.

    Args:
        df (pd.DataFrame): Pandas dataframe.
        cols (list of str): Name of columns in which to look for duplicates.

    Returns:
        df (pandas.DataFrame): Pandas dataframe with duplicate rows dropped.

    """
    for col in cols:
        # Reset index
@@ -63,14 +62,14 @@ def remove_duplicates(df, cols):

def remove_nan(df, cols):
    """ Remove rows with NaN values in specified column.

    Args:
        df (pandas.DataFrame): Pandas dataframe.
        cols (list of str): Name of columns in which to look for NaN.

    Returns:
        df (pandas.DataFrame): Pandas dataframe with invalid rows dropped.

    """
    for col in cols:
        # Convert any empty string cells to nan
@@ -84,10 +83,10 @@ def remove_nan(df, cols):

def clean_dataframe(df):
    """ Clean up the dataframe.

    Args:
        df (pandas.DataFrame): Pandas dataframe.

    Returns:
        df (pandas.DataFrame): Cleaned pandas dataframe.
    """
@@ -104,13 +103,13 @@


def retrieve_text(
    entry,
    container_name,
    azure_storage_account_name="azureopendatastorage",
    azure_storage_sas_token="",
):
    """ Retrieve body text from article of interest.

    Args:
        entry (pd.Series): A single row from the dataframe (df.iloc[n]).
        container_name (str): Azure storage container name.
@@ -142,13 +141,13 @@


def get_public_domain_text(
    df,
    container_name,
    azure_storage_account_name="azureopendatastorage",
    azure_storage_sas_token="",
):
    """ Get all public domain text.

    Args:
        df (pandas.DataFrame): Metadata dataframe for public domain text.
        container_name (str): Azure storage container name.
@@ -164,9 +163,9 @@ def get_public_domain_text(
    # Add in full_text
    df["full_text"] = df.apply(
        lambda row: retrieve_text(
            row,
            container_name,
            azure_storage_account_name,
            azure_storage_sas_token
        ), axis=1
    )
8 changes: 4 additions & 4 deletions recommenders/datasets/criteo.py
@@ -107,7 +107,7 @@ def load_spark_df(
        try:
            # Driver node's file path
            node_path = "file:" + filepath
-            ## needs to be on dbfs to load
+            # needs to be on dbfs to load
            dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
            path = dbfs_datapath
        except:
@@ -172,13 +172,13 @@ def get_spark_schema(header=DEFAULT_HEADER):
    Returns:
        pyspark.sql.types.StructType: Spark schema.
    """
-    ## create schema
+    # create schema
    schema = StructType()
-    ## do label + ints
+    # do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
-    ## do categoricals
+    # do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
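The comment changes above address flake8 E266 (too many leading '#' for a block comment). For context, get_spark_schema builds the standard Criteo layout: one integer label plus 13 integer features (n_ints = 14 covers both) and 26 categorical string features, 40 columns in total. A hedged usage sketch, assuming a live Spark session and a locally downloaded, tab-separated Criteo sample file:

from pyspark.sql import SparkSession
from recommenders.datasets.criteo import get_spark_schema

spark = SparkSession.builder.appName("criteo-schema-demo").getOrCreate()

schema = get_spark_schema()
print(len(schema.fields))  # 40 = 1 label + 13 integer + 26 categorical columns

# "dac_sample.txt" is a placeholder path; Criteo DAC files are tab-separated.
df = spark.read.csv("dac_sample.txt", schema=schema, sep="\t")
df.printSchema()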
1 change: 0 additions & 1 deletion recommenders/datasets/download_utils.py
@@ -10,7 +10,6 @@
from tempfile import TemporaryDirectory
from tqdm import tqdm
from retrying import retry
-import logging


log = logging.getLogger(__name__)
1 change: 0 additions & 1 deletion recommenders/datasets/movielens.py
@@ -30,7 +30,6 @@
        FloatType,
        LongType
    )
-    from pyspark.sql.functions import concat_ws, col
except ImportError:
    pass  # so the environment without spark doesn't break

1 change: 0 additions & 1 deletion recommenders/datasets/sparse.py
@@ -13,7 +13,6 @@
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
-    DEFAULT_TIMESTAMP_COL,
    DEFAULT_PREDICTION_COL,
)

1 change: 0 additions & 1 deletion recommenders/datasets/split_utils.py
@@ -1,7 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

-import pandas as pd
import numpy as np
import math

10 changes: 5 additions & 5 deletions recommenders/datasets/wikidata.py
@@ -61,7 +61,7 @@ def find_wikidata_id(name, limit=1, session=None):
    try:
        response = session.get(API_URL_WIKIPEDIA, params=params)
        page_id = response.json()["query"]["search"][0]["pageid"]
-    except Exception as e:
+    except Exception:
        # TODO: distinguish between connection error and entity not found
        logger.error("ENTITY NOT FOUND")
        return "entityNotFound"
@@ -79,7 +79,7 @@ def find_wikidata_id(name, limit=1, session=None):
        entity_id = response.json()["query"]["pages"][str(page_id)]["pageprops"][
            "wikibase_item"
        ]
-    except Exception as e:
+    except Exception:
        # TODO: distinguish between connection error and entity not found
        logger.error("ENTITY NOT FOUND")
        return "entityNotFound"
@@ -138,7 +138,7 @@ def query_entity_links(entity_id, session=None):
        data = session.get(
            API_URL_WIKIDATA, params=dict(query=query, format="json")
        ).json()
-    except Exception as e:
+    except Exception as e:  # noqa: F841
        logger.error("ENTITY NOT FOUND")
        return {}

@@ -184,7 +184,7 @@ def query_entity_description(entity_id, session=None):
    PREFIX schema: <http://schema.org/>

    SELECT ?o
    WHERE
    {
      wd:"""
        + entity_id
@@ -199,7 +199,7 @@
    try:
        r = session.get(API_URL_WIKIDATA, params=dict(query=query, format="json"))
        description = r.json()["results"]["bindings"][0]["o"]["value"]
-    except Exception as e:
+    except Exception as e:  # noqa: F841
        logger.error("DESCRIPTION NOT FOUND")
        return "descriptionNotFound"

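The wikidata.py edits use the two standard remedies for flake8 F841 (local variable assigned but never used): where the exception object is not needed, the 'as e' binding is dropped; where it is deliberately kept, the line gets a '# noqa: F841' marker so flake8 skips that check there. A contrived illustration of both patterns:

import logging

logger = logging.getLogger(__name__)

def fetch_json(session, url):
    try:
        return session.get(url).json()
    except Exception:  # binding dropped: the handler never inspects the exception
        logger.error("ENTITY NOT FOUND")
        return {}

def fetch_json_keep_exc(session, url):
    try:
        return session.get(url).json()
    except Exception as e:  # noqa: F841  kept on purpose, warning suppressed
        logger.error("ENTITY NOT FOUND")
        return {}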