diff --git a/.travis.yml b/.travis.yml index ce6b8ee6..92a7ff86 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,13 @@ install: before_script: - 'flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics' script: - - 'funnel -s demo/settings.yaml' + - 'funnel -s demo/settings.yaml -log-level DEBUG' + # NOTE: we might want to make below search somewhere else so it isn't + # so very specific. + - 'funnel -s demo/settings.yaml -kw Python Data Scientist PHD AI -ps WA -c Seattle -l USA_ENGLISH -log-level DEBUG' - 'pytest --cov=jobfunnel --cov-report=xml' + # - './tests/verify-artifacts.sh' TODO: verify that JSON exist and are good + # - './tests/verify_time.sh' TODO: some way of verifying execution time after_success: - 'bash <(curl -s https://codecov.io/bash)' + # - './demo/gen_call_graphs.sh' TODO: some way of showing .dot on GitHub? diff --git a/jobfunnel/backend/scrapers/base.py b/jobfunnel/backend/scrapers/base.py index e229333b..75df55f9 100644 --- a/jobfunnel/backend/scrapers/base.py +++ b/jobfunnel/backend/scrapers/base.py @@ -1,14 +1,11 @@ """The base scraper class to be used for all web-scraping emitting Job objects """ -import logging -import os import random -import sys from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor, as_completed from multiprocessing import Lock, Manager -from time import sleep, time -from typing import Any, Dict, List, Optional, Tuple, Union +from time import sleep +from typing import Any, Dict, List, Optional from bs4 import BeautifulSoup from requests import Session @@ -23,8 +20,10 @@ from jobfunnel.resources import (MAX_CPU_WORKERS, USER_AGENT_LIST, JobField, Locale) +# pylint: disable=using-constant-test,unused-import if False: # or typing.TYPE_CHECKING if python3.5.3+ from jobfunnel.config import JobFunnelConfigManager +# pylint: enable=using-constant-test,unused-import class BaseScraper(ABC, Logger): diff --git a/jobfunnel/backend/scrapers/glassdoor.py b/jobfunnel/backend/scrapers/glassdoor.py index a9b09e91..b726b570 100644 --- a/jobfunnel/backend/scrapers/glassdoor.py +++ b/jobfunnel/backend/scrapers/glassdoor.py @@ -1,28 +1,27 @@ """Scraper for www.glassdoor.X FIXME: this is currently unable to get past page 1 of job results. """ -import logging import re from abc import abstractmethod from concurrent.futures import ThreadPoolExecutor, wait -from datetime import date, datetime, timedelta from math import ceil -from time import sleep, time -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Tuple, Union from bs4 import BeautifulSoup from requests import Session -from jobfunnel.backend import Job, JobStatus +from jobfunnel.backend import Job from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper, BaseUSAEngScraper) from jobfunnel.backend.tools import get_webdriver from jobfunnel.backend.tools.filters import JobFilter from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str -from jobfunnel.resources import MAX_CPU_WORKERS, JobField, Locale +from jobfunnel.resources import MAX_CPU_WORKERS, JobField +# pylint: disable=using-constant-test,unused-import if False: # or typing.TYPE_CHECKING if python3.5.3+ from jobfunnel.config import JobFunnelConfigManager +# pylint: enable=using-constant-test,unused-import MAX_GLASSDOOR_LOCATIONS_TO_RETURN = 10 diff --git a/jobfunnel/backend/scrapers/indeed.py b/jobfunnel/backend/scrapers/indeed.py index 58dd4708..1506e26a 100644 --- a/jobfunnel/backend/scrapers/indeed.py +++ b/jobfunnel/backend/scrapers/indeed.py @@ -1,27 +1,24 @@ """Scraper designed to get jobs from www.indeed.X """ -import logging import re -from abc import abstractmethod from concurrent.futures import ThreadPoolExecutor, wait -from datetime import date, datetime, timedelta from math import ceil -from time import sleep, time -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional from bs4 import BeautifulSoup from requests import Session -from jobfunnel.backend import Job, JobStatus +from jobfunnel.backend import Job from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper, BaseUSAEngScraper) from jobfunnel.backend.tools.filters import JobFilter from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str -from jobfunnel.resources import MAX_CPU_WORKERS, JobField, Locale +from jobfunnel.resources import MAX_CPU_WORKERS, JobField +# pylint: disable=using-constant-test,unused-import if False: # or typing.TYPE_CHECKING if python3.5.3+ from jobfunnel.config import JobFunnelConfigManager - +# pylint: enable=using-constant-test,unused-import ID_REGEX = re.compile(r'id=\"sj_([a-zA-Z0-9]*)\"') MAX_RESULTS_PER_INDEED_PAGE = 50 @@ -210,7 +207,7 @@ def _get_search_url(self, method: Optional[str] = 'get') -> str: self.config.search_config.domain, self.query, self.config.search_config.city.replace(' ', '+',), - self.config.search_config.province_or_state, + self.config.search_config.province_or_state.upper(), self._convert_radius(self.config.search_config.radius), self.max_results_per_page, int(self.config.search_config.return_similar_results) diff --git a/jobfunnel/backend/scrapers/monster.py b/jobfunnel/backend/scrapers/monster.py index 90ab35b2..585ba9b5 100644 --- a/jobfunnel/backend/scrapers/monster.py +++ b/jobfunnel/backend/scrapers/monster.py @@ -1,26 +1,24 @@ """Scrapers for www.monster.X """ -import logging import re from abc import abstractmethod -from concurrent.futures import ThreadPoolExecutor, wait -from datetime import date, datetime, timedelta from math import ceil -from time import sleep, time -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional from bs4 import BeautifulSoup from requests import Session -from jobfunnel.backend import Job, JobStatus +from jobfunnel.backend import Job from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper, BaseUSAEngScraper) from jobfunnel.backend.tools.filters import JobFilter from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str -from jobfunnel.resources import MAX_CPU_WORKERS, JobField, Locale +from jobfunnel.resources import JobField +# pylint: disable=using-constant-test,unused-import if False: # or typing.TYPE_CHECKING if python3.5.3+ from jobfunnel.config import JobFunnelConfigManager +# pylint: enable=using-constant-test,unused-import MAX_RESULTS_PER_MONSTER_PAGE = 25 diff --git a/jobfunnel/backend/tools/delay.py b/jobfunnel/backend/tools/delay.py index 8fbfc07e..2ee45cbb 100644 --- a/jobfunnel/backend/tools/delay.py +++ b/jobfunnel/backend/tools/delay.py @@ -2,13 +2,11 @@ """ from math import ceil, log, sqrt from random import uniform -from time import time -from typing import Dict, List, Union +from typing import List, Union from numpy import arange -from scipy.special import expit +from scipy.special import expit # pylint: disable=no-name-in-module -from jobfunnel.backend import Job from jobfunnel.config import DelayConfig from jobfunnel.resources import DelayAlgorithm diff --git a/jobfunnel/backend/tools/tools.py b/jobfunnel/backend/tools/tools.py index 97992cca..dd850ac6 100644 --- a/jobfunnel/backend/tools/tools.py +++ b/jobfunnel/backend/tools/tools.py @@ -35,11 +35,13 @@ def get_logger(logger_name: str, level: int, file_path: str, """ logger = logging.getLogger(logger_name) logger.setLevel(level) - logging.basicConfig(filename=file_path, level=level) formatter = logging.Formatter(message_format) stdout_handler = logging.StreamHandler(sys.stdout) stdout_handler.setFormatter(formatter) logger.addHandler(stdout_handler) + file_handler = logging.FileHandler(file_path) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) return logger diff --git a/jobfunnel/config/cli.py b/jobfunnel/config/cli.py index 1bf24b8f..eca98a46 100644 --- a/jobfunnel/config/cli.py +++ b/jobfunnel/config/cli.py @@ -1,15 +1,12 @@ """Configuration parsing module for CLI --> JobFunnelConfigManager """ import argparse -import logging import os -from typing import Any, Dict, List import yaml -from jobfunnel.config import (SETTINGS_YAML_SCHEMA, DelayConfig, - JobFunnelConfigManager, ProxyConfig, - SearchConfig, SettingsValidator) +from jobfunnel.config import (DelayConfig, JobFunnelConfigManager, + ProxyConfig, SearchConfig, SettingsValidator) from jobfunnel.resources import (LOG_LEVEL_NAMES, DelayAlgorithm, Locale, Provider) from jobfunnel.resources.defaults import * @@ -85,14 +82,13 @@ def parse_cli(): ) parser.add_argument( - '-lf', - dest='log_file', + '-log-file', type=str, help=f'path to logging file. defaults to {DEFAULT_LOG_FILE}' ) parser.add_argument( - '--log-level', + '-log-level', type=str, default=DEFAULT_LOG_LEVEL_NAME, choices=LOG_LEVEL_NAMES, diff --git a/jobfunnel/config/manager.py b/jobfunnel/config/manager.py index ba9ae255..e9b6d69d 100644 --- a/jobfunnel/config/manager.py +++ b/jobfunnel/config/manager.py @@ -8,8 +8,10 @@ from jobfunnel.config import BaseConfig, DelayConfig, ProxyConfig, SearchConfig from jobfunnel.resources import BS4_PARSER +# pylint: disable=using-constant-test,unused-import if False: # or typing.TYPE_CHECKING if python3.5.3+ - from jobfunnel.backend.scrapers.base import BaseScraper + from jobfunnel.config import JobFunnelConfigManager +# pylint: enable=using-constant-test,unused-import class JobFunnelConfigManager(BaseConfig): diff --git a/jobfunnel/config/settings.py b/jobfunnel/config/settings.py index 21930736..ee0670bb 100644 --- a/jobfunnel/config/settings.py +++ b/jobfunnel/config/settings.py @@ -1,7 +1,6 @@ """Settings YAML Schema w/ validator """ import ipaddress -import logging from cerberus import Validator diff --git a/jobfunnel/resources/defaults.py b/jobfunnel/resources/defaults.py index 768febcf..8ac22f9f 100644 --- a/jobfunnel/resources/defaults.py +++ b/jobfunnel/resources/defaults.py @@ -35,7 +35,7 @@ DEFAULT_RECOVER = False DEFAULT_RETURN_SIMILAR_RESULTS = False DEFAULT_SAVE_DUPLICATES = False -DEFAULT_RANDOM_DELAY= False +DEFAULT_RANDOM_DELAY = False DEFAULT_RANDOM_CONVERGING_DELAY = False # Defaults we use from localization, the scraper can always override it.