Flake8 Fixes (Part1) #1550

Merged: 22 commits, Oct 19, 2021
Changes from 21 commits
2 changes: 0 additions & 2 deletions .github/workflows/nightly.yml
@@ -73,8 +73,6 @@ jobs:
          pip install tox

      - name: Run flake8
-        # TODO: re-enable this flake8 block (turned off to get a draft of the pipeline infrastructure)
-        continue-on-error: true
        run: |
          tox -e flake8

2 changes: 0 additions & 2 deletions .github/workflows/pr-gate.yml
@@ -62,8 +62,6 @@ jobs:
          pip install tox

      - name: Run flake8
-        # TODO: re-enable this flake8 block (turned off to get a draft of the pipeline infrastructure)
-        continue-on-error: true
        run: |
          tox -e flake8

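With 'continue-on-error: true' removed from both workflows, a failing flake8 run now fails the nightly and PR-gate jobs instead of being silently tolerated. A minimal sketch of reproducing the same gate locally, assuming tox and the flake8 environment these workflows invoke are available (the tox configuration itself is not part of this diff):

# run_lint.py - hypothetical local helper, not part of this PR.
# Invokes the same tox environment as the CI step and forwards its exit code,
# so lint errors fail the command just as they now fail the workflows.
import subprocess
import sys

def run_flake8():
    result = subprocess.run(["tox", "-e", "flake8"], check=False)
    return result.returncode

if __name__ == "__main__":
    sys.exit(run_flake8())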
6 changes: 3 additions & 3 deletions contrib/sarplus/python/pysarplus/SARPlus.py
@@ -93,12 +93,12 @@ def fit(self, df):
        query = self.f(
            """
        SELECT
            {col_user}, {col_item},
            SUM({col_rating} * EXP(-log(2) * (latest_timestamp - CAST({col_timestamp} AS long)) / ({time_decay_coefficient} * 3600 * 24))) as {col_rating}
        FROM {prefix}df_train_input,
            (SELECT CAST(MAX({col_timestamp}) AS long) latest_timestamp FROM {prefix}df_train_input)
        GROUP BY {col_user}, {col_item}
        CLUSTER BY {col_user}
        """
        )

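The three changed lines in this hunk appear to differ only in trailing whitespace (flake8 W291); the query itself is unchanged. For readers of the query, the EXP(...) term implements SAR's exponential time decay: each rating is weighted so that it halves every time_decay_coefficient days, measured from the latest event in the training data (the * 3600 * 24 converts days to seconds). A small illustrative sketch of the same weighting in plain Python; the real computation runs in Spark SQL as shown, and the 30-day default below is only an example value:

import math

def decayed_rating(rating, event_ts, latest_ts, time_decay_days=30):
    # Weight halves every `time_decay_days` days, matching
    # EXP(-log(2) * (latest_timestamp - timestamp) / (time_decay_coefficient * 3600 * 24))
    age_seconds = latest_ts - event_ts
    half_life_seconds = time_decay_days * 3600 * 24
    return rating * math.exp(-math.log(2) * age_seconds / half_life_seconds)

# A 5.0 rating given exactly one half-life (30 days) before the latest event
# contributes 2.5 after decay.
print(decayed_rating(5.0, event_ts=0, latest_ts=30 * 3600 * 24))  # 2.5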
4 changes: 1 addition & 3 deletions recommenders/datasets/amazon_reviews.py
@@ -2,17 +2,15 @@
# Licensed under the MIT License.

import os
import re
import shutil
import warnings
import pandas as pd
import gzip
import random
import logging
import _pickle as cPickle

from recommenders.utils.constants import SEED
-from recommenders.datasets.download_utils import maybe_download, download_path
+from recommenders.datasets.download_utils import maybe_download


random.seed(SEED)
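The import changes here are flake8 F401 fixes ("imported but unused"): download_path is dropped from the download_utils import, along with two other imports this module no longer uses. A contrived illustration of the rule, not taken from this repository:

# example.py - contrived module to show what F401 reports
import os
import shutil  # flake8: F401 'shutil' imported but unused (nothing below uses it)

def cache_dir(name):
    # os is referenced, so its import is fine
    return os.path.join("/tmp", name)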
16 changes: 8 additions & 8 deletions recommenders/datasets/cosmos_cli.py
@@ -5,12 +5,12 @@

def find_collection(client, dbid, id):
    """Find whether or not a CosmosDB collection exists.

    Args:
        client (object): A pydocumentdb client object.
        dbid (str): Database ID.
        id (str): Collection ID.

    Returns:
        bool: True if the collection exists, False otherwise.
    """
@@ -32,12 +32,12 @@ def find_collection(client, dbid, id):

def read_collection(client, dbid, id):
    """Read a CosmosDB collection.

    Args:
        client (object): A pydocumentdb client object.
        dbid (str): Database ID.
        id (str): Collection ID.

    Returns:
        object: A collection.
    """
@@ -55,11 +55,11 @@ def read_collection(client, dbid, id):

def read_database(client, id):
    """Read a CosmosDB database.

    Args:
        client (object): A pydocumentdb client object.
        id (str): Database ID.

    Returns:
        object: A database.
    """
@@ -76,11 +76,11 @@ def read_database(client, id):

def find_database(client, id):
    """Find whether or not a CosmosDB database exists.

    Args:
        client (object): A pydocumentdb client object.
        id (str): Database ID.

    Returns:
        bool: True if the database exists, False otherwise.
    """
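The cosmos_cli.py changes appear to be whitespace-only: the blank lines inside these docstrings carried trailing spaces, which flake8 reports as W293 (whitespace on blank line); W291 is the analogous rule for trailing whitespace on non-blank lines. A quick, hypothetical helper for stripping both from a file, not part of this PR (the PR simply edits the affected lines in place):

import sys

def strip_trailing_whitespace(path):
    # Removes W291/W293 violations by rstrip-ping every line and rewriting the file.
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line.rstrip() + "\n" for line in lines)

if __name__ == "__main__":
    for path in sys.argv[1:]:
        strip_trailing_whitespace(path)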
33 changes: 16 additions & 17 deletions recommenders/datasets/covid_utils.py
@@ -1,7 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

-import json
import numpy as np
import pandas as pd
import requests
@@ -22,7 +21,7 @@ def load_pandas_df(
        azure_storage_sas_token (str): Azure storage SAS token.
        container_name (str): Azure storage container name.
        metadata_filename (str): Name of file containing top-level metadata for the dataset.

    Returns:
        metadata (pandas.DataFrame): Metadata dataframe.
    """
@@ -39,14 +38,14 @@

def remove_duplicates(df, cols):
    """ Remove duplicated entries.

    Args:
        df (pd.DataFrame): Pandas dataframe.
        cols (list of str): Name of columns in which to look for duplicates.

    Returns:
        df (pandas.DataFrame): Pandas dataframe with duplicate rows dropped.

    """
    for col in cols:
        # Reset index
@@ -63,14 +62,14 @@ def remove_duplicates(df, cols):

def remove_nan(df, cols):
    """ Remove rows with NaN values in specified column.

    Args:
        df (pandas.DataFrame): Pandas dataframe.
        cols (list of str): Name of columns in which to look for NaN.

    Returns:
        df (pandas.DataFrame): Pandas dataframe with invalid rows dropped.

    """
    for col in cols:
        # Convert any empty string cells to nan
@@ -84,10 +83,10 @@ def remove_nan(df, cols):

def clean_dataframe(df):
    """ Clean up the dataframe.

    Args:
        df (pandas.DataFrame): Pandas dataframe.

    Returns:
        df (pandas.DataFrame): Cleaned pandas dataframe.
    """
@@ -104,13 +103,13 @@


def retrieve_text(
    entry,
    container_name,
    azure_storage_account_name="azureopendatastorage",
    azure_storage_sas_token="",
):
    """ Retrieve body text from article of interest.

    Args:
        entry (pd.Series): A single row from the dataframe (df.iloc[n]).
        container_name (str): Azure storage container name.
@@ -142,13 +141,13 @@


def get_public_domain_text(
    df,
    container_name,
    azure_storage_account_name="azureopendatastorage",
    azure_storage_sas_token="",
):
    """ Get all public domain text.

    Args:
        df (pandas.DataFrame): Metadata dataframe for public domain text.
        container_name (str): Azure storage container name.
@@ -164,9 +163,9 @@ def get_public_domain_text(
    # Add in full_text
    df["full_text"] = df.apply(
        lambda row: retrieve_text(
            row,
            container_name,
            azure_storage_account_name,
            azure_storage_sas_token
        ), axis=1
    )
8 changes: 4 additions & 4 deletions recommenders/datasets/criteo.py
@@ -107,7 +107,7 @@ def load_spark_df(
        try:
            # Driver node's file path
            node_path = "file:" + filepath
-            ## needs to be on dbfs to load
+            # needs to be on dbfs to load
            dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
            path = dbfs_datapath
        except:
@@ -172,13 +172,13 @@ def get_spark_schema(header=DEFAULT_HEADER):
    Returns:
        pyspark.sql.types.StructType: Spark schema.
    """
-    ## create schema
+    # create schema
    schema = StructType()
-    ## do label + ints
+    # do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
-    ## do categoricals
+    # do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
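The comment changes above address flake8 E266 (too many leading '#' for a block comment). For context, get_spark_schema builds the standard Criteo layout: one integer label plus 13 integer features (n_ints = 14 covers both) and 26 categorical string features, 40 columns in total. A hedged usage sketch, assuming a live Spark session and a locally downloaded, tab-separated Criteo sample file:

from pyspark.sql import SparkSession
from recommenders.datasets.criteo import get_spark_schema

spark = SparkSession.builder.appName("criteo-schema-demo").getOrCreate()

schema = get_spark_schema()
print(len(schema.fields))  # 40 = 1 label + 13 integer + 26 categorical columns

# "dac_sample.txt" is a placeholder path; Criteo DAC files are tab-separated.
df = spark.read.csv("dac_sample.txt", schema=schema, sep="\t")
df.printSchema()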
1 change: 0 additions & 1 deletion recommenders/datasets/download_utils.py
@@ -10,7 +10,6 @@
from tempfile import TemporaryDirectory
from tqdm import tqdm
from retrying import retry
-import logging


log = logging.getLogger(__name__)
1 change: 0 additions & 1 deletion recommenders/datasets/movielens.py
@@ -30,7 +30,6 @@
        FloatType,
        LongType
    )
-    from pyspark.sql.functions import concat_ws, col
except ImportError:
    pass  # so the environment without spark doesn't break

1 change: 0 additions & 1 deletion recommenders/datasets/sparse.py
@@ -13,7 +13,6 @@
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
-    DEFAULT_TIMESTAMP_COL,
    DEFAULT_PREDICTION_COL,
)

1 change: 0 additions & 1 deletion recommenders/datasets/split_utils.py
@@ -1,7 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

-import pandas as pd
import numpy as np
import math

10 changes: 5 additions & 5 deletions recommenders/datasets/wikidata.py
@@ -61,7 +61,7 @@ def find_wikidata_id(name, limit=1, session=None):
    try:
        response = session.get(API_URL_WIKIPEDIA, params=params)
        page_id = response.json()["query"]["search"][0]["pageid"]
-    except Exception as e:
+    except Exception:
        # TODO: distinguish between connection error and entity not found
        logger.error("ENTITY NOT FOUND")
        return "entityNotFound"
@@ -79,7 +79,7 @@ def find_wikidata_id(name, limit=1, session=None):
        entity_id = response.json()["query"]["pages"][str(page_id)]["pageprops"][
            "wikibase_item"
        ]
-    except Exception as e:
+    except Exception:
        # TODO: distinguish between connection error and entity not found
        logger.error("ENTITY NOT FOUND")
        return "entityNotFound"
@@ -138,7 +138,7 @@ def query_entity_links(entity_id, session=None):
        data = session.get(
            API_URL_WIKIDATA, params=dict(query=query, format="json")
        ).json()
-    except Exception as e:
+    except Exception as e:  # noqa: F841
        logger.error("ENTITY NOT FOUND")
        return {}

@@ -184,7 +184,7 @@ def query_entity_description(entity_id, session=None):
    PREFIX schema: <http://schema.org/>

    SELECT ?o
    WHERE
    {
      wd:"""
        + entity_id
@@ -199,7 +199,7 @@
    try:
        r = session.get(API_URL_WIKIDATA, params=dict(query=query, format="json"))
        description = r.json()["results"]["bindings"][0]["o"]["value"]
-    except Exception as e:
+    except Exception as e:  # noqa: F841
        logger.error("DESCRIPTION NOT FOUND")
        return "descriptionNotFound"

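The wikidata.py edits use the two standard remedies for flake8 F841 (local variable assigned but never used): where the exception object is not needed, the 'as e' binding is dropped; where it is deliberately kept, the line gets a '# noqa: F841' marker so flake8 skips that check there. A contrived illustration of both patterns:

import logging

logger = logging.getLogger(__name__)

def fetch_json(session, url):
    try:
        return session.get(url).json()
    except Exception:  # binding dropped: the handler never inspects the exception
        logger.error("ENTITY NOT FOUND")
        return {}

def fetch_json_keep_exc(session, url):
    try:
        return session.get(url).json()
    except Exception as e:  # noqa: F841  kept on purpose, warning suppressed
        logger.error("ENTITY NOT FOUND")
        return {}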