diff --git a/.codecov.yml b/.codecov.yml
new file mode 100644
index 00000000..27596b1a
--- /dev/null
+++ b/.codecov.yml
@@ -0,0 +1,9 @@
+coverage:
+  # FIXME: Set these back to automatic once we up coverage more
+  status:
+    patch:
+      default:
+        target: 30%
+    project:
+      default:
+        target: 30%
diff --git a/.gitignore b/.gitignore
index 4fc43cd6..773667c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,12 @@
-data/
+# Outputs
*.csv
-demo/data/
+demo_job_search_results
+
+# IntelliJ/Pycharm configs
+.idea/
+
+# GraphViz
+*.dot
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -177,4 +183,4 @@ $RECYCLE.BIN/
.com.apple.timemachine.donotpresent
# VScode trash
-.vscode/
\ No newline at end of file
+.vscode/
diff --git a/.idea/JobFunnel.iml b/.idea/JobFunnel.iml
deleted file mode 100644
index 29089b45..00000000
--- a/.idea/JobFunnel.iml
+++ /dev/null
@@ -1,15 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index bd43d393..00000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 1c2b45cf..00000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7f..00000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index 8af05521..252befdc 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,13 +1,20 @@
language: python
python:
- - '3.6.9'
+ - '3.8.0'
install:
+ - 'pip install -e .'
- 'pip install flake8 pipenv pytest-cov pytest-mock'
- - 'pipenv sync'
- 'python -m nltk.downloader stopwords'
-before_script: 'flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics'
+before_script:
+ - 'flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics'
script:
- - 'python -m jobfunnel -s demo/settings.yaml -o demo/'
+ # Run the CANADA_ENGLISH demo via the settings YAML
+ - 'funnel load -s demo/settings.yaml -log-level DEBUG'
+ # Run an American search via the CLI
+ - 'funnel inline -kw Python Data Scientist PHD AI -ps WA -c Seattle -l USA_ENGLISH -log-level DEBUG -csv demo_job_search_results/demo_search.csv -cache demo_job_search_results/cache2 -blf demo_job_search_results/demo_block_list.json -dl demo_job_search_results/demo_duplicates_list.json -log-file demo_job_search_results/log.log'
- 'pytest --cov=jobfunnel --cov-report=xml'
+ # - './tests/verify-artifacts.sh' TODO: verify that JSON exist and are good
+ # - './tests/verify_time.sh' TODO: some way of verifying execution time
after_success:
- 'bash <(curl -s https://codecov.io/bash)'
+ # - './demo/gen_call_graphs.sh' TODO: some way of showing .dot on GitHub?
diff --git a/LICENSE b/LICENSE
index d8d1a92c..f6ff2f97 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2017 Paul McInnis
+Copyright (c) 2020 Paul McInnis
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/MANIFEST.in b/MANIFEST.in
index 32ef57fe..4a09a320 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,4 @@
-include jobfunnel/config/settings.yaml
-include jobfunnel/text/user_agent_list.txt
+include jobfunnel/demo/settings.yaml
+include jobfunnel/resources/user_agent_list.txt
+include readme.md
+include LICENSE
diff --git a/Pipfile b/Pipfile
deleted file mode 100644
index 538eb902..00000000
--- a/Pipfile
+++ /dev/null
@@ -1,14 +0,0 @@
-[[source]]
-name = "pypi"
-url = "https://pypi.org/simple"
-verify_ssl = true
-
-[dev-packages]
-
-[packages]
-jobfunnel = {path = ".",editable = true}
-selenium = "*"
-webdriver_manager = "*"
-
-[requires]
-python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
deleted file mode 100644
index a17e661e..00000000
--- a/Pipfile.lock
+++ /dev/null
@@ -1,375 +0,0 @@
-{
- "_meta": {
- "hash": {
- "sha256": "0840194ad12b002f72da2e91c7102bbd184cbf167cd2bda40f6de3db105d9928"
- },
- "pipfile-spec": 6,
- "requires": {
- "python_version": "3.6"
- },
- "sources": [
- {
- "name": "pypi",
- "url": "https://pypi.org/simple",
- "verify_ssl": true
- }
- ]
- },
- "default": {
- "attrs": {
- "hashes": [
- "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c",
- "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"
- ],
- "version": "==19.3.0"
- },
- "beautifulsoup4": {
- "hashes": [
- "sha256:594ca51a10d2b3443cbac41214e12dbb2a1cd57e1a7344659849e2e20ba6a8d8",
- "sha256:a4bbe77fd30670455c5296242967a123ec28c37e9702a8a81bd2f20a4baf0368",
- "sha256:d4e96ac9b0c3a6d3f0caae2e4124e6055c5dcafde8e2f831ff194c104f0775a0"
- ],
- "version": "==4.9.0"
- },
- "certifi": {
- "hashes": [
- "sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304",
- "sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519"
- ],
- "version": "==2020.4.5.1"
- },
- "chardet": {
- "hashes": [
- "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
- "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
- ],
- "version": "==3.0.4"
- },
- "click": {
- "hashes": [
- "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a",
- "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"
- ],
- "version": "==7.1.2"
- },
- "colorama": {
- "hashes": [
- "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff",
- "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"
- ],
- "version": "==0.4.3"
- },
- "configparser": {
- "hashes": [
- "sha256:2ca44140ee259b5e3d8aaf47c79c36a7ab0d5e94d70bd4105c03ede7a20ea5a1",
- "sha256:cffc044844040c7ce04e9acd1838b5f2e5fa3170182f6fda4d2ea8b0099dbadd"
- ],
- "version": "==5.0.0"
- },
- "crayons": {
- "hashes": [
- "sha256:50e5fa729d313e2c607ae8bf7b53bb487652e10bd8e7a1e08c4bc8bf62755ffc",
- "sha256:8c9e4a3a607bc10e9a9140d496ecd16c6805088dd16c852c378f1f1d5db7aeb6"
- ],
- "version": "==0.3.0"
- },
- "idna": {
- "hashes": [
- "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb",
- "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"
- ],
- "version": "==2.9"
- },
- "importlib-metadata": {
- "hashes": [
- "sha256:2a688cbaa90e0cc587f1df48bdc97a6eadccdcd9c35fb3f976a09e3b5016d90f",
- "sha256:34513a8a0c4962bc66d35b359558fd8a5e10cd472d37aec5f66858addef32c1e"
- ],
- "markers": "python_version < '3.8'",
- "version": "==1.6.0"
- },
- "jobfunnel": {
- "editable": true,
- "path": "."
- },
- "joblib": {
- "hashes": [
- "sha256:0630eea4f5664c463f23fbf5dcfc54a2bc6168902719fa8e19daf033022786c8",
- "sha256:bdb4fd9b72915ffb49fde2229ce482dd7ae79d842ed8c2b4c932441495af1403"
- ],
- "version": "==0.14.1"
- },
- "lxml": {
- "hashes": [
- "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd",
- "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c",
- "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081",
- "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f",
- "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261",
- "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a",
- "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9",
- "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a",
- "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb",
- "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60",
- "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128",
- "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a",
- "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717",
- "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89",
- "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72",
- "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8",
- "sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3",
- "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7",
- "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8",
- "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77",
- "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1",
- "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15",
- "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679",
- "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012",
- "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6",
- "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc",
- "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca"
- ],
- "version": "==4.5.0"
- },
- "more-itertools": {
- "hashes": [
- "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c",
- "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507"
- ],
- "version": "==8.2.0"
- },
- "nltk": {
- "hashes": [
- "sha256:845365449cd8c5f9731f7cb9f8bd6fd0767553b9d53af9eb1b3abf7700936b35"
- ],
- "version": "==3.5"
- },
- "numpy": {
- "hashes": [
- "sha256:00d7b54c025601e28f468953d065b9b121ddca7fff30bed7be082d3656dd798d",
- "sha256:02ec9582808c4e48be4e93cd629c855e644882faf704bc2bd6bbf58c08a2a897",
- "sha256:0e6f72f7bb08f2f350ed4408bb7acdc0daba637e73bce9f5ea2b207039f3af88",
- "sha256:1be2e96314a66f5f1ce7764274327fd4fb9da58584eaff00b5a5221edefee7d6",
- "sha256:2466fbcf23711ebc5daa61d28ced319a6159b260a18839993d871096d66b93f7",
- "sha256:2b573fcf6f9863ce746e4ad00ac18a948978bb3781cffa4305134d31801f3e26",
- "sha256:3f0dae97e1126f529ebb66f3c63514a0f72a177b90d56e4bce8a0b5def34627a",
- "sha256:50fb72bcbc2cf11e066579cb53c4ca8ac0227abb512b6cbc1faa02d1595a2a5d",
- "sha256:57aea170fb23b1fd54fa537359d90d383d9bf5937ee54ae8045a723caa5e0961",
- "sha256:709c2999b6bd36cdaf85cf888d8512da7433529f14a3689d6e37ab5242e7add5",
- "sha256:7d59f21e43bbfd9a10953a7e26b35b6849d888fc5a331fa84a2d9c37bd9fe2a2",
- "sha256:904b513ab8fbcbdb062bed1ce2f794ab20208a1b01ce9bd90776c6c7e7257032",
- "sha256:96dd36f5cdde152fd6977d1bbc0f0561bccffecfde63cd397c8e6033eb66baba",
- "sha256:9933b81fecbe935e6a7dc89cbd2b99fea1bf362f2790daf9422a7bb1dc3c3085",
- "sha256:bbcc85aaf4cd84ba057decaead058f43191cc0e30d6bc5d44fe336dc3d3f4509",
- "sha256:dccd380d8e025c867ddcb2f84b439722cf1f23f3a319381eac45fd077dee7170",
- "sha256:e22cd0f72fc931d6abc69dc7764484ee20c6a60b0d0fee9ce0426029b1c1bdae",
- "sha256:ed722aefb0ebffd10b32e67f48e8ac4c5c4cf5d3a785024fdf0e9eb17529cd9d",
- "sha256:efb7ac5572c9a57159cf92c508aad9f856f1cb8e8302d7fdb99061dbe52d712c",
- "sha256:efdba339fffb0e80fcc19524e4fdbda2e2b5772ea46720c44eaac28096d60720",
- "sha256:f22273dd6a403ed870207b853a856ff6327d5cbce7a835dfa0645b3fc00273ec"
- ],
- "version": "==1.18.4"
- },
- "packaging": {
- "hashes": [
- "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3",
- "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752"
- ],
- "version": "==20.3"
- },
- "pluggy": {
- "hashes": [
- "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
- "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
- ],
- "version": "==0.13.1"
- },
- "py": {
- "hashes": [
- "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa",
- "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0"
- ],
- "version": "==1.8.1"
- },
- "pyparsing": {
- "hashes": [
- "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
- "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
- ],
- "version": "==2.4.7"
- },
- "pytest": {
- "hashes": [
- "sha256:0e5b30f5cb04e887b91b1ee519fa3d89049595f428c1db76e73bd7f17b09b172",
- "sha256:84dde37075b8805f3d1f392cc47e38a0e59518fb46a431cfdaf7cf1ce805f970"
- ],
- "version": "==5.4.1"
- },
- "python-dateutil": {
- "hashes": [
- "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
- "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
- ],
- "version": "==2.8.1"
- },
- "pyyaml": {
- "hashes": [
- "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97",
- "sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76",
- "sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2",
- "sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648",
- "sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf",
- "sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f",
- "sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2",
- "sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee",
- "sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d",
- "sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c",
- "sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a"
- ],
- "version": "==5.3.1"
- },
- "regex": {
- "hashes": [
- "sha256:021a0ae4d2baeeb60a3014805a2096cb329bd6d9f30669b7ad0da51a9cb73349",
- "sha256:04d6e948ef34d3eac133bedc0098364a9e635a7914f050edb61272d2ddae3608",
- "sha256:099568b372bda492be09c4f291b398475587d49937c659824f891182df728cdf",
- "sha256:0ff50843535593ee93acab662663cb2f52af8e31c3f525f630f1dc6156247938",
- "sha256:1b17bf37c2aefc4cac8436971fe6ee52542ae4225cfc7762017f7e97a63ca998",
- "sha256:1e2255ae938a36e9bd7db3b93618796d90c07e5f64dd6a6750c55f51f8b76918",
- "sha256:2bc6a17a7fa8afd33c02d51b6f417fc271538990297167f68a98cae1c9e5c945",
- "sha256:3ab5e41c4ed7cd4fa426c50add2892eb0f04ae4e73162155cd668257d02259dd",
- "sha256:3b059e2476b327b9794c792c855aa05531a3f3044737e455d283c7539bd7534d",
- "sha256:4df91094ced6f53e71f695c909d9bad1cca8761d96fd9f23db12245b5521136e",
- "sha256:5493a02c1882d2acaaf17be81a3b65408ff541c922bfd002535c5f148aa29f74",
- "sha256:5b741ecc3ad3e463d2ba32dce512b412c319993c1bb3d999be49e6092a769fb2",
- "sha256:652ab4836cd5531d64a34403c00ada4077bb91112e8bcdae933e2eae232cf4a8",
- "sha256:669a8d46764a09f198f2e91fc0d5acdac8e6b620376757a04682846ae28879c4",
- "sha256:73a10404867b835f1b8a64253e4621908f0d71150eb4e97ab2e7e441b53e9451",
- "sha256:7ce4a213a96d6c25eeae2f7d60d4dad89ac2b8134ec3e69db9bc522e2c0f9388",
- "sha256:8127ca2bf9539d6a64d03686fd9e789e8c194fc19af49b69b081f8c7e6ecb1bc",
- "sha256:b5b5b2e95f761a88d4c93691716ce01dc55f288a153face1654f868a8034f494",
- "sha256:b7c9f65524ff06bf70c945cd8d8d1fd90853e27ccf86026af2afb4d9a63d06b1",
- "sha256:f7f2f4226db6acd1da228adf433c5c3792858474e49d80668ea82ac87cf74a03",
- "sha256:fa09da4af4e5b15c0e8b4986a083f3fd159302ea115a6cc0649cd163435538b8"
- ],
- "version": "==2020.5.7"
- },
- "requests": {
- "hashes": [
- "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
- "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
- ],
- "version": "==2.23.0"
- },
- "scikit-learn": {
- "hashes": [
- "sha256:1bf45e62799b6938357cfce19f72e3751448c4b27010e4f98553da669b5bbd86",
- "sha256:267ad874b54c67b479c3b45eb132ef4a56ab2b27963410624a413a4e2a3fc388",
- "sha256:2d1bb83d6c51a81193d8a6b5f31930e2959c0e1019d49bdd03f54163735dae4b",
- "sha256:349ba3d837fb3f7cb2b91486c43713e4b7de17f9e852f165049b1b7ac2f81478",
- "sha256:3f4d8eea3531d3eaf613fa33f711113dfff6021d57a49c9d319af4afb46f72f0",
- "sha256:4990f0e166292d2a0f0ee528233723bcfd238bfdb3ec2512a9e27f5695362f35",
- "sha256:57538d138ba54407d21e27c306735cbd42a6aae0df6a5a30c7a6edde46b0017d",
- "sha256:5b722e8bb708f254af028dc2da86d23df5371cba57e24f889b672e7b15423caa",
- "sha256:6043e2c4ccfc68328c331b0fc19691be8fb02bd76d694704843a23ad651de902",
- "sha256:672ea38eb59b739a8907ec063642b486bcb5a2073dda5b72b7983eeaf1fd67c1",
- "sha256:73207dca6e70f8f611f28add185cf3a793c8232a1722f21d82259560dc35cd50",
- "sha256:83fc104a799cb340054e485c25dfeee712b36f5638fb374eba45a9db490f16ff",
- "sha256:8416150ab505f1813da02cdbdd9f367b05bfc75cf251235015bb09f8674358a0",
- "sha256:84e759a766c315deb5c85139ff879edbb0aabcddb9358acf499564ed1c21e337",
- "sha256:8ed66ab27b3d68e57bb1f315fc35e595a5c4a1f108c3420943de4d18fc40e615",
- "sha256:a7f8aa93f61aaad080b29a9018db93ded0586692c03ddf2122e47dd1d3a14e1b",
- "sha256:ddd3bf82977908ff69303115dd5697606e669d8a7eafd7d83bb153ef9e11bd5e",
- "sha256:de9933297f8659ee3bb330eafdd80d74cd73d5dab39a9026b65a4156bc479063",
- "sha256:ea91a70a992ada395efc3d510cf011dc2d99dc9037bb38cd1cb00e14745005f5",
- "sha256:eb4c9f0019abb374a2e55150f070a333c8f990b850d1eb4dfc2765fc317ffc7c",
- "sha256:ffce8abfdcd459e72e5b91727b247b401b22253cbd18d251f842a60e26262d6f"
- ],
- "version": "==0.22.2.post1"
- },
- "scipy": {
- "hashes": [
- "sha256:00af72998a46c25bdb5824d2b729e7dabec0c765f9deb0b504f928591f5ff9d4",
- "sha256:0902a620a381f101e184a958459b36d3ee50f5effd186db76e131cbefcbb96f7",
- "sha256:1e3190466d669d658233e8a583b854f6386dd62d655539b77b3fa25bfb2abb70",
- "sha256:2cce3f9847a1a51019e8c5b47620da93950e58ebc611f13e0d11f4980ca5fecb",
- "sha256:3092857f36b690a321a662fe5496cb816a7f4eecd875e1d36793d92d3f884073",
- "sha256:386086e2972ed2db17cebf88610aab7d7f6e2c0ca30042dc9a89cf18dcc363fa",
- "sha256:71eb180f22c49066f25d6df16f8709f215723317cc951d99e54dc88020ea57be",
- "sha256:770254a280d741dd3436919d47e35712fb081a6ff8bafc0f319382b954b77802",
- "sha256:787cc50cab3020a865640aba3485e9fbd161d4d3b0d03a967df1a2881320512d",
- "sha256:8a07760d5c7f3a92e440ad3aedcc98891e915ce857664282ae3c0220f3301eb6",
- "sha256:8d3bc3993b8e4be7eade6dcc6fd59a412d96d3a33fa42b0fa45dc9e24495ede9",
- "sha256:9508a7c628a165c2c835f2497837bf6ac80eb25291055f56c129df3c943cbaf8",
- "sha256:a144811318853a23d32a07bc7fd5561ff0cac5da643d96ed94a4ffe967d89672",
- "sha256:a1aae70d52d0b074d8121333bc807a485f9f1e6a69742010b33780df2e60cfe0",
- "sha256:a2d6df9eb074af7f08866598e4ef068a2b310d98f87dc23bd1b90ec7bdcec802",
- "sha256:bb517872058a1f087c4528e7429b4a44533a902644987e7b2fe35ecc223bc408",
- "sha256:c5cac0c0387272ee0e789e94a570ac51deb01c796b37fb2aad1fb13f85e2f97d",
- "sha256:cc971a82ea1170e677443108703a2ec9ff0f70752258d0e9f5433d00dda01f59",
- "sha256:dba8306f6da99e37ea08c08fef6e274b5bf8567bb094d1dbe86a20e532aca088",
- "sha256:dc60bb302f48acf6da8ca4444cfa17d52c63c5415302a9ee77b3b21618090521",
- "sha256:dee1bbf3a6c8f73b6b218cb28eed8dd13347ea2f87d572ce19b289d6fd3fbc59"
- ],
- "version": "==1.4.1"
- },
- "selenium": {
- "hashes": [
- "sha256:2d7131d7bc5a5b99a2d9b04aaf2612c411b03b8ca1b1ee8d3de5845a9be2cb3c",
- "sha256:deaf32b60ad91a4611b98d8002757f29e6f2c2d5fcaf202e1c9ad06d6772300d"
- ],
- "index": "pypi",
- "version": "==3.141.0"
- },
- "six": {
- "hashes": [
- "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
- "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"
- ],
- "version": "==1.14.0"
- },
- "soupsieve": {
- "hashes": [
- "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae",
- "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69"
- ],
- "version": "==2.0"
- },
- "tqdm": {
- "hashes": [
- "sha256:4733c4a10d0f2a4d098d801464bdaf5240c7dadd2a7fde4ee93b0a0efd9fb25e",
- "sha256:acdafb20f51637ca3954150d0405ff1a7edde0ff19e38fb99a80a66210d2a28f"
- ],
- "version": "==4.46.0"
- },
- "urllib3": {
- "hashes": [
- "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527",
- "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115"
- ],
- "version": "==1.25.9"
- },
- "wcwidth": {
- "hashes": [
- "sha256:cafe2186b3c009a04067022ce1dcd79cb38d8d65ee4f4791b8888d6599d1bbe1",
- "sha256:ee73862862a156bf77ff92b09034fc4825dd3af9cf81bc5b360668d425f3c5f1"
- ],
- "version": "==0.1.9"
- },
- "webdriver-manager": {
- "hashes": [
- "sha256:87f3f4bfda4917fa0ef1387fd1ddbbb5738e6961eb846434895801167771f652"
- ],
- "index": "pypi",
- "version": "==2.4.0"
- },
- "zipp": {
- "hashes": [
- "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b",
- "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"
- ],
- "version": "==3.1.0"
- }
- },
- "develop": {}
-}
diff --git a/demo/assests/demo.gif b/demo/assests/demo.gif
deleted file mode 100644
index 275a1748..00000000
Binary files a/demo/assests/demo.gif and /dev/null differ
diff --git a/demo/assests/demo.png b/demo/assests/demo.png
deleted file mode 100644
index 13cf1a75..00000000
Binary files a/demo/assests/demo.png and /dev/null differ
diff --git a/demo/demo.png b/demo/demo.png
new file mode 100644
index 00000000..a39ddb4c
Binary files /dev/null and b/demo/demo.png differ
diff --git a/demo/readme.md b/demo/readme.md
deleted file mode 100644
index f1369f9a..00000000
--- a/demo/readme.md
+++ /dev/null
@@ -1,15 +0,0 @@
-To try this demo simply open a terminal on this folder and type:
-
-```bash
-funnel -s settings.yaml
-```
-
-Feel free to copy the settings file to configure your own job searches.
-If installed via `pip`, the `funnel` command can be called from any folder in your system.
-
-__*Note*__: JobFunnel prioritizes Linux development.
-Therefore, naturally Jobfunnel works best in Linux.
-It has been tested on other operating systems such as Windows.
-Get in touch with the developers if you would like to maintain macOS or Windows support.
-
-![Demo GIF](assests/demo.gif)
diff --git a/demo/settings.yaml b/demo/settings.yaml
index 23b93e64..bd6cc0c5 100644
--- a/demo/settings.yaml
+++ b/demo/settings.yaml
@@ -1,46 +1,63 @@
-# all paths are relative to this file
-
-# paths
-output_path: './'
-
-# providers from which to search (case insensitive)
-providers:
-
- - 'Indeed'
- - 'Monster'
- - 'GlassDoorStatic'
- # - 'GlassDoorDynamic'
-
-# filters
-search_terms:
- region:
- province: 'ON'
- city: 'waterloo'
- domain: 'ca'
- radius: 10
-
+# This is an example of a feature-complete JobFunnel configuration YAML.
+# Try this out by simply running: "funnel load -s demo/settings.yaml"
+
+# Path where your master CSV, block-lists, and cache data will be stored
+# NOTE: we create any missing directories in these filepaths
+master_csv_file: demo_job_search_results/demo_search.csv
+cache_folder: demo_job_search_results/cache
+block_list_file: demo_job_search_results/demo_block_list.json
+duplicates_list_file: demo_job_search_results/demo_duplicates_list.json
+log_file: demo_job_search_results/log.log
+
+# Job search configuration
+search:
+
+ # Locale settings, one of USA_ENGLISH, CANADA_ENGLISH, CANADA_FRENCH:
+ # This tells JobFunnel where the website we are scraping is located, and
+ # what language the contents are in.
+ locale: CANADA_ENGLISH
+
+ # Job providers which we will search, one of INDEED, MONSTER, GLASSDOOR:
+ # NOTE: we choose domain via locale (i.e. CANADA_ENGLISH -> www.indeed.ca)
+ # FIXME: we need to add back GLASSDOOR when that's working again.
+ providers:
+ - INDEED
+ - MONSTER
+
+ # Region that we are searching for jobs within:
+ province_or_state: "ON" # NOTE: this is generally 2 characters long.
+ city: "Waterloo" # NOTE: this is the full city / town name.
+ radius: 25 # km (NOTE: if we were in locale: USA_ENGLISH it's in miles)
+
+ # These are the terms you would be typing into the website's search field:
keywords:
- - 'Python'
-
-black_list:
- - 'Infox Consulting'
- - 'Terminal'
-
-# logging level options are: critical, error, warning, info, debug, notset
-log_level: 'info'
-
-# saves duplicates removed by tfidf filter to duplicate_list.csv
-save_duplicates: False
-
-# delaying algorithm configuration
-delay_config:
- # functions used for delaying algorithm, options are: constant, linear, sigmoid
- function: 'linear'
- # maximum delay/upper bound for converging random delay
- delay: 10
- # minimum delay/lower bound for random delay
- min_delay: 1
- # random delay
- random: True
- # converging random delay, only used if 'random' is set to True
- converge: True
+ - Python
+
+ # Don't return any listings older than this:
+ max_listing_days: 35
+
+ # Blocked company names that will never appear in any results:
+ company_block_list:
+ - "Infox Consulting"
+
+# Logging level options are: critical, error, warning, info, debug, notset
+log_level: INFO
+
+# Delaying algorithm configuration
+delay:
+ # Delaying algorithm to use, one of: CONSTANT, LINEAR, SIGMOID
+ algorithm: LINEAR
+ # Maximum delay/upper bound for converging random delay
+ max_duration: 5.0
+ # Minimum delay/lower bound for random delay
+ min_duration: 1.0
+ # Random delay
+ random: False
+ # Converging random delay, only used if 'random' is set to True
+ converging: False
+
+# # Proxy settings
+# proxy:
+# protocol: https # NOTE: you can also set to 'http'
+# ip: "1.1.1.1"
+# port: '200'
diff --git a/docs/crontab/cronjob.sh b/docs/crontab/cronjob.sh
index d1bc0d07..36b0bb04 100755
--- a/docs/crontab/cronjob.sh
+++ b/docs/crontab/cronjob.sh
@@ -7,6 +7,6 @@ do
if [ -d "$DUMP/$location" ] && echo "funnel scaping job for $location @ $(date +"%T")"
then
cd $DUMP/$location &&
-~/.local/bin/funnel -s settings.yaml > cronjob.log 2>&1
+~/.local/bin/funnel load -s settings.yaml > cronjob.log 2>&1
fi
done
diff --git a/docs/gen_call_graphs.sh b/docs/gen_call_graphs.sh
new file mode 100755
index 00000000..f8d0f565
--- /dev/null
+++ b/docs/gen_call_graphs.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Install pyan3 (pip3 install pyan3) and then you can run the commands below:
+echo "building call graph .dot files in ./call_graphs"
+
+mkdir ./call_graphs
+pyan3 jobfunnel/backend/tools/filters.py -c --dot > ./call_graphs/filters.dot
+pyan3 jobfunnel/backend/scrapers/indeed.py -c --dot > ./call_graphs/indeed.dot
+pyan3 jobfunnel/backend/jobfunnel.py -c --dot > ./call_graphs/jobfunnel.dot
+
+echo "Done."
+# You can then visualize the generated .dot files with Graphviz by rendering
+# them to SVG, or paste their contents into http://www.webgraphviz.com/ to view
+# them online.
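+#
+# For example (assuming Graphviz is installed), one graph can be rendered with:
+#   dot -Tsvg ./call_graphs/jobfunnel.dot -o ./call_graphs/jobfunnel.svg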
diff --git a/docs/pycharm/images/debug_configurations.png b/docs/pycharm/images/debug_configurations.png
deleted file mode 100644
index 2bdb8c3a..00000000
Binary files a/docs/pycharm/images/debug_configurations.png and /dev/null differ
diff --git a/docs/pycharm/images/pycharm.png b/docs/pycharm/images/pycharm.png
deleted file mode 100644
index 4f85fccd..00000000
Binary files a/docs/pycharm/images/pycharm.png and /dev/null differ
diff --git a/docs/pycharm/images/pycharm_banner.png b/docs/pycharm/images/pycharm_banner.png
deleted file mode 100644
index b28385a8..00000000
Binary files a/docs/pycharm/images/pycharm_banner.png and /dev/null differ
diff --git a/docs/pycharm/images/svg/pycharm.svg b/docs/pycharm/images/svg/pycharm.svg
deleted file mode 100644
index d186b7a5..00000000
--- a/docs/pycharm/images/svg/pycharm.svg
+++ /dev/null
@@ -1,71 +0,0 @@
-
-
-
diff --git a/docs/pycharm/images/svg/pycharm_banner.svg b/docs/pycharm/images/svg/pycharm_banner.svg
deleted file mode 100644
index fffd8817..00000000
--- a/docs/pycharm/images/svg/pycharm_banner.svg
+++ /dev/null
@@ -1,37 +0,0 @@
-
-
-
diff --git a/docs/pycharm/readme.md b/docs/pycharm/readme.md
deleted file mode 100644
index 89752f83..00000000
--- a/docs/pycharm/readme.md
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-This document is a guide to developing this project with PyCharm.
-
-### Installing PyCharm
-
-Go to the JetBrains website for instructions on how to install.
-https://www.jetbrains.com/help/pycharm/installation-guide.html
-
-### Installing Pipenv
-
-Go to the Jetbrains website for instructions on how to install and set up pipenv.
-https://www.jetbrains.com/help/pycharm/pipenv.html
-
-### Setting Debug Configurations
-
-Do not follow these fields exactly. However, if you go to _Run->Edit Configurations..._
-make a new run configuration by clicking _+_. Click _Python_ and fill in the fields
-as shown below.
-
diff --git a/images/jobfunnel.png b/images/jobfunnel.png
deleted file mode 100644
index 02b7dd02..00000000
Binary files a/images/jobfunnel.png and /dev/null differ
diff --git a/images/jobfunnel_banner.png b/images/jobfunnel_banner.png
deleted file mode 100644
index 11b8ee12..00000000
Binary files a/images/jobfunnel_banner.png and /dev/null differ
diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
index d3a156bb..da1b425d 100644
--- a/jobfunnel/__init__.py
+++ b/jobfunnel/__init__.py
@@ -1 +1,3 @@
-__version__ = '2.1.9'
+"""JobFunnel base package init, we keep module version here.
+"""
+__version__ = '3.0.0'
diff --git a/jobfunnel/__main__.py b/jobfunnel/__main__.py
index 220dfc86..e816a939 100755
--- a/jobfunnel/__main__.py
+++ b/jobfunnel/__main__.py
@@ -1,74 +1,30 @@
#!python
-"""main script, scrapes data off several listings, pickles it,
-and applies search filters"""
+"""Builds a config from CLI, runs desired scrapers and updates JSON + CSV
+"""
import sys
-
-from typing import Union
-
-from .config.parser import parse_config, ConfigError
-from .config.validate import validate_config
-
-from .jobfunnel import JobFunnel
-from .indeed import Indeed
-from .monster import Monster
-from .glassdoor_base import GlassDoorBase
-from .glassdoor_dynamic import GlassDoorDynamic
-from .glassdoor_static import GlassDoorStatic
-
-PROVIDERS = {
- 'indeed': Indeed,
- 'monster': Monster,
- 'glassdoorstatic': GlassDoorStatic,
- 'glassdoordynamic': GlassDoorDynamic
-}
+from .backend.jobfunnel import JobFunnel
+from .config import parse_cli, build_config_dict, get_config_manager
def main():
- """main function"""
- try:
- config = parse_config()
- validate_config(config)
-
- except ConfigError as e:
- print(e.strerror)
- sys.exit()
-
- # init class + logging
- jf = JobFunnel(config)
- jf.init_logging()
-
- # parse the master list path to update filter list
- jf.update_filterjson()
-
- # get jobs by either scraping jobs or loading dumped pickles
- if config['recover']:
- jf.load_pickles(config)
- elif config['no_scrape']:
- jf.load_pickle(config)
+    """Parse the CLI and run JobFunnel to manage scrapers and lists
+ """
+ # Parse CLI into validated schema
+ args = parse_cli(sys.argv[1:])
+ cfg_dict = build_config_dict(args)
+
+ # Build config manager
+ funnel_cfg = get_config_manager(cfg_dict)
+ funnel_cfg.create_dirs()
+
+ # Init
+ job_funnel = JobFunnel(funnel_cfg)
+
+ # Run or recover
+ if args['do_recovery_mode']:
+ job_funnel.recover()
else:
- for p in config['providers']:
- # checks to see if provider is glassdoor
- provider: Union[Monster,
- Indeed, GlassDoorDynamic, GlassDoorStatic] = PROVIDERS[p](config)
-
- provider_id = provider.__class__.__name__
-
- try:
- provider.scrape()
- jf.scrape_data.update(provider.scrape_data)
- except Exception as e:
- jf.logger.error(
- f'failed to scrape {provider_id}: {str(e)}')
-
- # dump scraped data to pickle
- jf.dump_pickle()
-
- # filter scraped data and dump to the masterlist file
- jf.update_masterlist()
-
- # done!
- jf.logger.info('done. see un-archived jobs in ' +
- config['master_list_path'])
+ job_funnel.run()
if __name__ == '__main__':
diff --git a/jobfunnel/backend/__init__.py b/jobfunnel/backend/__init__.py
new file mode 100644
index 00000000..0188e570
--- /dev/null
+++ b/jobfunnel/backend/__init__.py
@@ -0,0 +1 @@
+from jobfunnel.backend.job import Job, JobStatus
diff --git a/jobfunnel/backend/job.py b/jobfunnel/backend/job.py
new file mode 100644
index 00000000..655f1a31
--- /dev/null
+++ b/jobfunnel/backend/job.py
@@ -0,0 +1,227 @@
+"""Base Job class to be populated by Scrapers, manipulated by Filters and saved
+to csv / etc by Exporter
+"""
+from copy import deepcopy
+from datetime import date, datetime
+from typing import Dict, List, Optional
+
+from bs4 import BeautifulSoup
+
+from jobfunnel.resources import (CSV_HEADER, MAX_BLOCK_LIST_DESC_CHARS,
+ MIN_DESCRIPTION_CHARS, PRINTABLE_STRINGS,
+ JobStatus, Locale)
+
+# If job.status == one of these we filter it out of results
+JOB_REMOVE_STATUSES = [
+ JobStatus.DELETE, JobStatus.ARCHIVE, JobStatus.REJECTED, JobStatus.OLD
+]
+
+
+class Job():
+ """The base Job object which contains job information as attribs
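+
+    A minimal sketch of how a Scraper might construct one (all field values
+    below are illustrative placeholders, not taken from any real listing):
+
+        job = Job(
+            title='Python Developer',
+            company='ACME Corp',
+            location='Waterloo, ON',
+            description='A sufficiently long, human-readable description...',
+            url='https://example.com/job/123',
+            locale=Locale.CANADA_ENGLISH,
+            query='Python',
+            provider='indeed',
+            status=JobStatus.NEW,
+            key_id='abc123',
+        )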
+ """
+ def __init__(self,
+ title: str,
+ company: str,
+ location: str,
+ description: str,
+ url: str,
+ locale: Locale,
+ query: str,
+ provider: str,
+ status: JobStatus,
+ key_id: Optional[str] = '',
+ scrape_date: Optional[date] = None,
+ short_description: Optional[str] = None,
+ post_date: Optional[date] = None,
+ raw: Optional[BeautifulSoup] = None,
+ wage: Optional[str] = None,
+ tags: Optional[List[str]] = None,
+ remote: Optional[str] = None) -> None:
+ """Object to represent a single job that we have scraped
+
+ TODO integrate init with JobField somehow, ideally with validation.
+ TODO: would be nice to use something standardized for location str
+ TODO: wage ought to be a number or an object, but is str for flexibility
+        NOTE: ideally key_id is provided by the scraper, but Monster set()s it,
+        so it now has a default value and is checked for in validate()
+
+ Args:
+ title (str): title of the job (should be somewhat short)
+ company (str): company the job was posted for (should also be short)
+ location (str): string that tells the user where the job is located
+ description (str): content of job description, ideally this is human
+ readable.
+ key_id (str): unique identifier for the job TODO: make more robust?
+ url (str): link to the page where the job exists
+ locale (Locale): identifier to help us with internationalization,
+ tells us what the locale of the scraper was that scraped this
+ job.
+ query (str): the search string that this job was found with
+ provider (str): name of the job source
+ status (JobStatus): the status of the job (i.e. new)
+ scrape_date (Optional[date]): date the job was scraped, Defaults
+ to the time that the job object is created.
+ short_description (Optional[str]): user-readable short description
+ (one-liner)
+ post_date (Optional[date]): the date the job became available on the
+ job source. Defaults to None.
+ raw (Optional[BeautifulSoup]): raw scrape data that we can use for
+                debugging/pickling, defaults to None.
+ wage (Optional[str], optional): string describing wage (may be est)
+ tags (Optional[List[str]], optional): additional key-words that are
+ in the job posting that identify the job. Defaults to [].
+ remote (Optional[str], optional): string describing remote work
+ allowance/status i.e. ('temporarily remote', 'fully remote' etc)
+ """
+ # These must be populated by a Scraper
+ self.title = title
+ self.company = company
+ self.location = location
+ self.description = description
+ self.key_id = key_id
+ self.url = url
+ self.locale = locale
+ self.query = query
+ self.provider = provider
+ self.status = status
+ self.wage = wage
+ self.remote = remote
+
+ # These may not always be populated in our job source
+ self.post_date = post_date
+ self.scrape_date = scrape_date if scrape_date else datetime.today()
+ self.tags = tags if tags else []
+ if short_description:
+ self.short_description = short_description
+ else:
+ self.short_description = ''
+
+ # Semi-private attrib for debugging
+ self._raw_scrape_data = raw
+
+ @property
+ def is_remove_status(self) -> bool:
+ """Return True if the job's status is one of our removal statuses.
+ """
+ return self.status in JOB_REMOVE_STATUSES
+
+ def update_if_newer(self, job: 'Job') -> bool:
+        """Update an existing job with new metadata while keeping the user's
+        status, but only if job.post_date > existing_job.post_date!
+
+ NOTE: if you have hours or minutes or seconds set, and jobs were scraped
+ on the same day, the comparison will favour the extra info as newer!
+ TODO: we should do more checks to ensure we are not seeing a totally
+ different job by accident (since this check is usually done by key_id)
+ TODO: Currently we do day precision but if we wanted to update because
+ something is newer by hours we will need to revisit this limitation and
+ store scrape hour/etc in the CSV as well.
+
+ Returns:
+ True if we updated self with job, False if we didn't
+ """
+ if (job.post_date > self.post_date):
+ # Update all attrs other than status (which user can set).
+ self.company = deepcopy(job.company)
+ self.location = deepcopy(job.location)
+ self.description = deepcopy(job.description)
+ self.key_id = deepcopy(job.key_id) # NOTE: be careful doing this!
+ self.url = deepcopy(job.url)
+ self.locale = deepcopy(job.locale)
+ self.query = deepcopy(job.query)
+ self.provider = deepcopy(job.provider)
+ self.wage = deepcopy(job.wage)
+ self.remote = deepcopy(job.remote)
+ self.post_date = deepcopy(job.post_date)
+ self.scrape_date = deepcopy(job.scrape_date)
+ self.tags = deepcopy(job.tags)
+ self.short_description = deepcopy(job.short_description)
+ # pylint: disable=protected-access
+ self._raw_scrape_data = deepcopy(job._raw_scrape_data)
+ # pylint: enable=protected-access
+ return True
+ else:
+ return False
+
+ def is_old(self, max_age: datetime) -> bool:
+ """Identify if a job is older than a certain max_age
+
+        Args:
+            max_age: the cutoff datetime; jobs posted before it are too old
+
+        Returns:
+            True if the job was posted before max_age
+            False if it's fresh enough to keep
+ """
+ return self.post_date < max_age
+
+ @property
+ def as_row(self) -> Dict[str, str]:
+ """Builds a CSV row dict for this job entry
+
+ TODO: this is legacy, no support for short_description yet.
+ NOTE: RAW cannot be put into CSV.
+ """
+ return dict([
+ (h, v) for h,v in zip(
+ CSV_HEADER,
+ [
+ self.status.name,
+ self.title,
+ self.company,
+ self.location,
+ self.post_date.strftime('%Y-%m-%d'),
+ self.description,
+ ', '.join(self.tags),
+ self.url,
+ self.key_id,
+ self.provider,
+ self.query,
+ self.locale.name,
+ self.wage,
+ self.remote,
+ ]
+ )
+ ])
+
+ @property
+ def as_json_entry(self) -> Dict[str, str]:
+ """This formats a job for the purpose of saving it to a block JSON
+ i.e. duplicates list file or user's block list file
+ NOTE: we truncate descriptions in block lists
+ """
+ return {
+ 'title': self.title,
+ 'company': self.company,
+ 'post_date': self.post_date.strftime('%Y-%m-%d'),
+ 'description': (
+ self.description[:MAX_BLOCK_LIST_DESC_CHARS] + '..'
+ ) if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS else (
+ self.description
+ ),
+ 'status': self.status.name,
+ }
+
+    def clean_strings(self) -> None:
+        """Ensure that all string fields have only printable chars
+        TODO: maybe we can use stopwords?
+        """
+        for attr_name in ('title', 'company', 'description', 'url', 'key_id',
+                          'provider', 'query', 'wage'):
+            # Re-assign via setattr so the cleaned value actually sticks
+            value = getattr(self, attr_name)
+            if value is not None:
+                setattr(self, attr_name, ''.join(
+                    filter(lambda char: char in PRINTABLE_STRINGS, value)
+                ))
+        # Tags are a list of strings, so clean each tag individually
+        self.tags = [
+            ''.join(filter(lambda char: char in PRINTABLE_STRINGS, tag))
+            for tag in self.tags
+        ]
+
+ def validate(self) -> None:
+ """Simple checks just to ensure that the metadata is good
+ TODO: consider expanding to cover all attribs.
+ """
+ assert self.key_id, "Key_ID is unset!"
+ assert self.title, "Title is unset!"
+ assert self.company, "Company is unset!"
+ assert self.url, "URL is unset!"
+ if len(self.description) < MIN_DESCRIPTION_CHARS:
+ raise ValueError("Description too short!")
diff --git a/jobfunnel/backend/jobfunnel.py b/jobfunnel/backend/jobfunnel.py
new file mode 100755
index 00000000..3c8c5b0a
--- /dev/null
+++ b/jobfunnel/backend/jobfunnel.py
@@ -0,0 +1,539 @@
+"""Scrapes jobs, applies search filters and writes pickles to master list
+Paul McInnis 2020
+"""
+import csv
+import json
+import os
+import pickle
+from datetime import date, datetime, timedelta
+from time import time
+from typing import Dict, List
+
+from requests import Session
+
+from jobfunnel import __version__
+from jobfunnel.backend import Job
+from jobfunnel.backend.tools import Logger
+from jobfunnel.backend.tools.filters import DuplicatedJob, JobFilter
+from jobfunnel.config import JobFunnelConfigManager
+from jobfunnel.resources import (CSV_HEADER, T_NOW,
+ DuplicateType, JobStatus, Locale)
+
+
+class JobFunnel(Logger):
+ """Class that initializes a Scraper and scrapes a website to get jobs
+ """
+
+ def __init__(self, config: JobFunnelConfigManager) -> None:
+ """Initialize a JobFunnel object, with a JobFunnel Config
+
+ Args:
+ config (JobFunnelConfigManager): config object containing paths etc.
+ """
+ config.validate() # NOTE: this ensures log file path exists
+ super().__init__(level=config.log_level, file_path=config.log_file)
+ self.config = config
+ self.__date_string = date.today().strftime("%Y-%m-%d")
+ self.master_jobs_dict = {} # type: Dict[str, Job]
+
+ # Open a session with/out a proxy configured
+ self.session = Session()
+ if self.config.proxy_config:
+ self.session.proxies = {
+ self.config.proxy_config.protocol: self.config.proxy_config.url
+ }
+
+ # Read the user's block list
+ user_block_jobs_dict = {} # type: Dict[str, str]
+ if os.path.isfile(self.config.user_block_list_file):
+ user_block_jobs_dict = json.load(
+ open(self.config.user_block_list_file, 'r')
+ )
+
+ # Read the user's duplicate jobs list (from TFIDF)
+ duplicate_jobs_dict = {} # type: Dict[str, str]
+ if os.path.isfile(self.config.duplicates_list_file):
+ duplicate_jobs_dict = json.load(
+ open(self.config.duplicates_list_file, 'r')
+ )
+
+ # Initialize our job filter
+ self.job_filter = JobFilter(
+ user_block_jobs_dict,
+ duplicate_jobs_dict,
+ self.config.search_config.blocked_company_names,
+ T_NOW - timedelta(days=self.config.search_config.max_listing_days),
+ log_level=self.config.log_level,
+ log_file=self.config.log_file,
+ )
+
+ @property
+ def daily_cache_file(self) -> str:
+        """The name of the pickle file containing the data scraped today.
+ TODO: instead of using a 'daily' cache file, we should be tying this
+ into the search that was made to prevent cross-caching results.
+ """
+ return os.path.join(
+ self.config.cache_folder, f"jobs_{self.__date_string}.pkl",
+ )
+
+ def run(self) -> None:
+ """Scrape, update lists and save to CSV.
+ """
+ # Read the master CSV file
+ if os.path.isfile(self.config.master_csv_file):
+ self.master_jobs_dict = self.read_master_csv()
+
+ # Load master csv jobs if they exist and update our block list with
+ # any jobs the user has set the status to == a remove status
+ # NOTE: we want to do this first to make our filters use current info.
+ if self.master_jobs_dict:
+ self.update_user_block_list()
+ else:
+ self.logger.debug(
+ "No master-CSV present, did not update block-list: %s",
+ self.config.user_block_list_file
+ )
+
+ # Scrape jobs or load them from a cache if one exists (--no-scrape)
+ scraped_jobs_dict = {} # type: Dict[str, Job]
+ if self.config.no_scrape:
+
+ # Load cache since --no-scrape is set
+ self.logger.info("Skipping scraping, running with --no-scrape.")
+ if os.path.exists(self.daily_cache_file):
+ scraped_jobs_dict = self.load_cache(self.daily_cache_file)
+ else:
+ self.logger.warning(
+ "No incoming jobs, missing cache: %s", self.daily_cache_file
+ )
+ else:
+
+ # Scrape new jobs from all our configured providers and cache them
+ scraped_jobs_dict = self.scrape()
+ self.write_cache(scraped_jobs_dict)
+
+ # Filter out any jobs we have rejected, archived or block-listed
+ # NOTE: we do not remove duplicates here as these may trigger updates
+ if scraped_jobs_dict:
+ scraped_jobs_dict = self.job_filter.filter(
+ scraped_jobs_dict, remove_existing_duplicate_keys=False
+ )
+ if self.master_jobs_dict:
+ self.master_jobs_dict = self.job_filter.filter(
+ self.master_jobs_dict, remove_existing_duplicate_keys=False,
+ )
+
+ # Parse duplicate jobs into updates for master jobs dict
+ # NOTE: we prevent inter-scrape duplicates by key-id within BaseScraper
+ # FIXME: impl. TFIDF on inter-scrape duplicates
+ duplicate_jobs = [] # type: List[DuplicatedJob]
+ if self.master_jobs_dict and scraped_jobs_dict:
+
+ # Remove jobs with duplicated key_ids from scrape + update master
+ duplicate_jobs = self.job_filter.find_duplicates(
+ self.master_jobs_dict, scraped_jobs_dict,
+ )
+
+ for match in duplicate_jobs:
+
+ # Was it a key-id match?
+                if match.type in [DuplicateType.KEY_ID,
+                                  DuplicateType.EXISTING_TFIDF]:
+
+ # NOTE: original and duplicate have same key id for these.
+ # When it's EXISTING_TFIDF, we can't set match.duplicate
+ # because it is only partially stored in the block list JSON
+ if match.original.key_id and (match.original.key_id
+ != match.duplicate.key_id):
+ raise ValueError(
+ "Found duplicate by key-id, but keys dont match! "
+ f"{match.original.key_id}, {match.duplicate.key_id}"
+ )
+
+ # Got a key-id match, pop from scrape dict and maybe update
+ upd = self.master_jobs_dict[
+ match.duplicate.key_id].update_if_newer(
+ scraped_jobs_dict.pop(match.duplicate.key_id))
+
+ self.logger.debug(
+ "Identified duplicate %s by key-id and %s original job "
+ "with its data.",
+ match.duplicate.key_id,
+ 'updated older' if upd else 'did not update',
+ )
+
+ # Was it a content-match?
+ elif match.type == DuplicateType.NEW_TFIDF:
+
+ # Got a content match, pop from scrape dict and maybe update
+ upd = self.master_jobs_dict[
+ match.original.key_id].update_if_newer(
+ scraped_jobs_dict.pop(match.duplicate.key_id)
+ )
+ self.logger.debug(
+ "Identified %s as a duplicate by description and %s "
+ "original job %s with its data.",
+ match.duplicate.key_id,
+ 'updated older' if upd else 'did not update',
+ match.original.key_id,
+ )
+
+ # Update duplicates file (if any updates are incoming)
+ if duplicate_jobs:
+ self.update_duplicates_file()
+
+ # Update master jobs dict with the incoming jobs that passed filters
+ if scraped_jobs_dict:
+ self.master_jobs_dict.update(scraped_jobs_dict)
+
+ # Write-out to CSV or log messages
+ if self.master_jobs_dict:
+
+ # Write our updated jobs out (if none, dont make the file at all)
+ self.write_master_csv(self.master_jobs_dict)
+ self.logger.info(
+ "Done. View your current jobs in %s",
+ self.config.master_csv_file
+ )
+
+ else:
+ # We got no new, unique jobs. This is normal if loading scrape
+ # with --no-scrape as all jobs are removed by duplicate filter
+ if self.config.no_scrape:
+ # User is running --no-scrape probably just to update lists
+ self.logger.debug("No new jobs were added.")
+ else:
+ self.logger.warning("No new jobs were added to CSV.")
+
+ def _check_for_inter_scraper_validity(self, existing_jobs: Dict[str, Job],
+ incoming_jobs: Dict[str, Job],
+ ) -> None:
+ """Verify that we aren't overwriting jobs by key-id between scrapers
+ NOTE: this is a slow check, would be cool to improve the O(n) on this
+ """
+ existing_job_keys = existing_jobs.keys()
+ for inc_key_id in incoming_jobs.keys():
+ for exist_key_id in existing_job_keys:
+ if inc_key_id == exist_key_id:
+ raise ValueError(
+ f"Inter-scraper key-id duplicate! {exist_key_id}"
+ )
+
+    def scrape(self) -> Dict[str, Job]:
+ """Run each of the desired Scraper.scrape() with threading and delaying
+ """
+ self.logger.info(
+ "Scraping local providers with: %s", self.config.scraper_names
+ )
+
+ # Iterate thru scrapers and run their scrape.
+ jobs = {} # type: Dict[str, Job]
+ for scraper_cls in self.config.scrapers:
+ start = time()
+ scraper = scraper_cls(self.session, self.config, self.job_filter)
+ incoming_jobs_dict = scraper.scrape()
+
+ # Ensure we have no duplicates between our scrapers by key-id
+ # (since we are updating the jobs dict with results)
+ self._check_for_inter_scraper_validity(
+ incoming_jobs_dict,
+ jobs,
+ )
+
+            jobs.update(incoming_jobs_dict)
+            end = time()
+            self.logger.debug(
+                "Scraped %d jobs from %s, took %.3fs",
+                len(incoming_jobs_dict), scraper_cls.__name__, (end - start),
+ )
+
+ self.logger.info(
+ "Completed all scraping, found %d new jobs.", len(jobs)
+ )
+ return jobs
+
+ def recover(self) -> None:
+ """Build a new master CSV from all the available pickles in our cache
+ """
+ self.logger.info("Recovering jobs from all cache files in cache folder")
+ if os.path.exists(self.config.user_block_list_file):
+ self.logger.warning(
+ "Running recovery mode, but with existing block-list, delete "
+ "%s if you want to start fresh from the cached data and not "
+ "filter any jobs away.", self.config.user_block_list_file
+ )
+ all_jobs_dict = {} # type: Dict[str, Job]
+ for file in os.listdir(self.config.cache_folder):
+ if '.pkl' in file:
+ all_jobs_dict.update(
+ self.load_cache(
+ os.path.join(self.config.cache_folder, file)
+ )
+ )
+ self.write_master_csv(self.job_filter.filter(all_jobs_dict))
+
+ def load_cache(self, cache_file: str) -> Dict[str, Job]:
+ """Load today's scrape data from pickle via date string
+
+ TODO: search the cache for pickles that match search config.
+        (we may need a registry for the pickles and search terms used)
+
+ Args:
+ cache_file (str): path to cache pickle file containing jobs dict
+ keyed by Job.KEY_ID.
+
+ Raises:
+ FileNotFoundError: if cache file is missing
+
+ Returns:
+            Dict[str, Job]: jobs dict loaded from the cache, keyed by key_id
+ """
+ if not os.path.exists(cache_file):
+ raise FileNotFoundError(
+ f"{cache_file} not found! Have you scraped any jobs today?"
+ )
+ else:
+ cache_dict = pickle.load(open(cache_file, 'rb'))
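+            # Cache files are dicts with 'version' and 'jobs_dict' keys
+            # (matching what write_cache() dumps below).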
+ jobs_dict = cache_dict['jobs_dict']
+ version = cache_dict['version']
+ if version != __version__:
+ # NOTE: this may be an error in the future
+ self.logger.warning(
+ "Loaded jobs cache has version mismatch! "
+ "cache version: %s, current version: %s",
+ version, __version__
+ )
+ self.logger.info(
+ "Read %d jobs from previously-scraped jobs cache: %s.",
+ len(jobs_dict.keys()), cache_file,
+ )
+ self.logger.debug(
+ "NOTE: you may see many duplicate IDs detected if these jobs "
+ "exist in your master CSV already."
+ )
+ return jobs_dict
+
+ def write_cache(self, jobs_dict: Dict[str, Job],
+ cache_file: str = None) -> None:
+ """Dump a jobs_dict into a pickle
+
+        TODO: write search_config into the cache file as well
+ TODO: some way to cache Job.RAW without hitting recursion limit
+
+ Args:
+ jobs_dict (Dict[str, Job]): jobs dict to dump into cache.
+ cache_file (str, optional): file path to write to. Defaults to None.
+ """
+ cache_file = cache_file if cache_file else self.daily_cache_file
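+        # Drop raw scrape data (BeautifulSoup) before pickling; caching
+        # Job.RAW currently hits the recursion limit (see TODO above).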
+ for job in jobs_dict.values():
+ job._raw_scrape_data = None # pylint: disable=protected-access
+ pickle.dump(
+ {
+ 'version': __version__,
+ 'jobs_dict': jobs_dict,
+ },
+ open(cache_file, 'wb'),
+ )
+ self.logger.debug(
+ "Dumped %d jobs to %s", len(jobs_dict.keys()), cache_file
+ )
+
+ def read_master_csv(self) -> Dict[str, Job]:
+ """Read in the master-list CSV to a dict of unique Jobs
+
+ TODO: make blurb --> description and add short_description
+
+ Returns:
+ Dict[str, Job]: unique Job objects in the CSV
+ """
+ jobs_dict = {} # type: Dict[str, Job]
+ with open(self.config.master_csv_file, 'r', encoding='utf8',
+ errors='ignore') as csvfile:
+ for row in csv.DictReader(csvfile):
+
+ # NOTE: we are doing legacy support here with 'blurb' etc.
+ # In the future we should have an actual short description
+ if 'short_description' in row:
+ short_description = row['short_description']
+ else:
+ short_description = ''
+ post_date = datetime.strptime(row['date'], '%Y-%m-%d')
+
+ if 'scrape_date' in row:
+ scrape_date = datetime.strptime(
+ row['scrape_date'], '%Y-%m-%d'
+ )
+ else:
+ scrape_date = post_date
+
+ if 'raw' in row:
+                    # NOTE: we should never see this because raw can't be in CSV
+ raw = row['raw']
+ else:
+ raw = None
+
+ # We need to convert from user statuses
+ status = None
+ if 'status' in row:
+ status_str = row['status'].strip()
+ for p_status in JobStatus:
+ if status_str.lower() == p_status.name.lower():
+ status = p_status
+ break
+ if not status:
+ self.logger.warning(
+ "Unknown status %s, setting to UNKNOWN", status_str
+ )
+ status = JobStatus.UNKNOWN
+
+ # NOTE: this is for legacy support:
+ locale = None
+ if 'locale' in row:
+ locale_str = row['locale'].strip()
+ for p_locale in Locale:
+ if locale_str.lower() == p_locale.name.lower():
+ locale = p_locale
+ break
+ if not locale:
+ self.logger.warning(
+ "Unknown locale %s, setting to UNKNOWN", locale_str
+ )
+                        locale = Locale.UNKNOWN
+
+ job = Job(
+ title=row['title'],
+ company=row['company'],
+ location=row['location'],
+ description=row['blurb'],
+ key_id=row['id'],
+ url=row['link'],
+ locale=locale,
+ query=row['query'],
+ status=status,
+ provider=row['provider'],
+ short_description=short_description,
+ post_date=post_date,
+ scrape_date=scrape_date,
+ raw=raw,
+ tags=row['tags'].split(','),
+ )
+ job.validate()
+ jobs_dict[job.key_id] = job
+
+ self.logger.debug(
+ "Read %d jobs from master-CSV: %s",
+ len(jobs_dict.keys()), self.config.master_csv_file
+ )
+ return jobs_dict
+
+ def write_master_csv(self, jobs: Dict[str, Job]) -> None:
+ """Write out our dict of unique Jobs to a CSV
+
+ Args:
+            jobs (Dict[str, Job]): Dict of unique Jobs, keyed by unique key_id
+ """
+ with open(self.config.master_csv_file, 'w', encoding='utf8') as csvfile:
+ writer = csv.DictWriter(csvfile, fieldnames=CSV_HEADER)
+ writer.writeheader()
+ for job in jobs.values():
+ job.validate()
+ writer.writerow(job.as_row)
+ self.logger.debug(
+ "Wrote %d jobs to %s", len(jobs), self.config.master_csv_file,
+ )
+
+ def update_user_block_list(self) -> None:
+        """From data in master CSV file, add jobs with removable statuses to
+ our configured user block list file and save (if any)
+
+ NOTE: adding jobs to block list will result in filter() removing them
+        from all scraped & cached jobs in the future (persistent).
+
+ Raises:
+ FileNotFoundError: if no master_jobs_dict is provided and master csv
+ file does not exist.
+ """
+
+ # Try to load from CSV if master_jobs_dict is un-set
+ if not self.master_jobs_dict:
+ if os.path.isfile(self.config.master_csv_file):
+ self.master_jobs_dict = self.read_master_csv()
+ else:
+ raise FileNotFoundError(
+ f"Cannot update {self.config.user_block_list_file} without "
+ f"{self.config.master_csv_file}"
+ )
+
+ # Add jobs from csv that need to be filtered away, if any + update self
+ n_jobs_added = 0
+ for job in self.master_jobs_dict.values():
+ if job.is_remove_status:
+ if job.key_id not in self.job_filter.user_block_jobs_dict:
+ n_jobs_added += 1
+ self.job_filter.user_block_jobs_dict[
+ job.key_id] = job.as_json_entry
+ self.logger.info(
+ "Added %s to %s",
+ job.key_id,
+ self.config.user_block_list_file
+ )
+ else:
+ # This could happen if we are somehow mishandling block list
+ self.logger.warning(
+ "Job %s has been set to a removable status and removed "
+ "from master CSV multiple times.", job.key_id
+ )
+
+ if n_jobs_added:
+ # Write out complete list with any additions from the masterlist
+ # NOTE: we use indent=4 so that it stays human-readable.
+ with open(self.config.user_block_list_file, 'w',
+ encoding='utf8') as outfile:
+ outfile.write(
+ json.dumps(
+ self.job_filter.user_block_jobs_dict,
+ indent=4,
+ sort_keys=True,
+ separators=(',', ': '),
+ ensure_ascii=False,
+ )
+ )
+
+ self.logger.info(
+ "Moved %d jobs into block-list due to removable statuses: %s",
+ n_jobs_added, self.config.user_block_list_file
+ )
+
+ def update_duplicates_file(self) -> None:
+ """Update duplicates filter file if we have a path and contents
+ TODO: this should be writing out DuplicatedJob objects and a version
+ so that we retain links to original jobs.
+ """
+ if self.config.duplicates_list_file:
+ if self.job_filter.duplicate_jobs_dict:
+
+ # Write out the changes NOTE: indent=4 is for human-readability
+ self.logger.debug("Extending existing duplicate jobs dict.")
+ with open(self.config.duplicates_list_file, 'w',
+ encoding='utf8') as outfile:
+ outfile.write(
+ json.dumps(
+ self.job_filter.duplicate_jobs_dict,
+ indent=4,
+ sort_keys=True,
+ separators=(',', ': '),
+ ensure_ascii=False,
+ )
+ )
+ else:
+ self.logger.debug(
+ "Current duplicate jobs dict is empty, no updates written."
+ )
+ else:
+ self.logger.warning(
+ "Duplicates will not be saved, no duplicates list "
+ "file set. Saving to a duplicates file will ensure "
+ "that jobs detected to be duplicates by contents persist."
+ )
diff --git a/jobfunnel/tools/__init__.py b/jobfunnel/backend/scrapers/__init__.py
similarity index 100%
rename from jobfunnel/tools/__init__.py
rename to jobfunnel/backend/scrapers/__init__.py
diff --git a/jobfunnel/backend/scrapers/base.py b/jobfunnel/backend/scrapers/base.py
new file mode 100644
index 00000000..44f72f08
--- /dev/null
+++ b/jobfunnel/backend/scrapers/base.py
@@ -0,0 +1,438 @@
+"""The base scraper class to be used for all web-scraping emitting Job objects
+Paul McInnis 2020
+"""
+import random
+from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from multiprocessing import Lock, Manager
+from time import sleep
+from typing import Any, Dict, List, Optional
+
+from bs4 import BeautifulSoup
+from requests import Session
+from requests.adapters import HTTPAdapter
+from tqdm import tqdm
+from urllib3.util import Retry
+
+from jobfunnel.backend import Job, JobStatus
+from jobfunnel.backend.tools import Logger
+from jobfunnel.backend.tools.delay import calculate_delays
+from jobfunnel.backend.tools.filters import JobFilter
+from jobfunnel.resources import (MAX_CPU_WORKERS, USER_AGENT_LIST, JobField,
+ Locale)
+
+# pylint: disable=using-constant-test,unused-import
+if False: # or typing.TYPE_CHECKING if python3.5.3+
+ from jobfunnel.config import JobFunnelConfigManager
+# pylint: enable=using-constant-test,unused-import
+
+
+class BaseScraper(ABC, Logger):
+ """Base scraper object, for scraping and filtering Jobs from a provider
+ """
+
+ def __init__(self, session: Session, config: 'JobFunnelConfigManager',
+ job_filter: JobFilter) -> None:
+ """Init
+
+ Args:
+ session (Session): session object used to make post and get requests
+ config (JobFunnelConfigManager): config containing all needed paths,
+ search proxy, delaying and other metadata.
+ job_filter (JobFilter): object for filtering incoming jobs using
+ various internal filters, including a content-matching tool.
+                NOTE: this also runs on-the-fly, pre-empting unpromising job
+                scrapes to minimize session usage.
+
+ Raises:
+ ValueError: if no Locale is configured in the JobFunnelConfigManager
+ """
+ # Inits
+ super().__init__(level=config.log_level, file_path=config.log_file)
+ self.job_filter = job_filter
+ self.session = session
+ self.config = config
+ if self.headers:
+ self.session.headers.update(self.headers)
+
+ # Elongate the retries TODO: make configurable
+ retry = Retry(connect=3, backoff_factor=0.5)
+ adapter = HTTPAdapter(max_retries=retry)
+ self.session.mount('http://', adapter)
+ self.session.mount('https://', adapter)
+
+ # Ensure that the locale we want to use matches the locale that the
+ # scraper was written to scrape in:
+ if self.config.search_config.locale != self.locale:
+ raise ValueError(
+ f"Attempting to use scraper designed for {self.locale.name} "
+ "when config indicates user is searching with "
+ f"{self.config.search_config.locale.name}"
+ )
+
+ # Ensure our properties satisfy constraints
+ self._validate_get_set()
+ self.thread_manager = Manager()
+
+ # Construct actions list which respects priority for scraping Jobs
+ self._actions_list = [(True, f) for f in self.job_get_fields]
+ self._actions_list += [(False, f) for f in self.job_set_fields if f
+ in self.high_priority_get_set_fields]
+ self._actions_list += [(False, f) for f in self.job_set_fields if f not
+ in self.high_priority_get_set_fields]
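+
+        # Illustrative example (not executed): for a scraper with
+        #   job_get_fields = [KEY_ID, TITLE]
+        #   job_set_fields = [RAW, DESCRIPTION]
+        #   high_priority_get_set_fields = [RAW]
+        # the actions list becomes, in scrape order:
+        #   [(True, KEY_ID), (True, TITLE), (False, RAW), (False, DESCRIPTION)]
+        # i.e. all get() fields first, then high-priority set() fields, then
+        # the remaining set() fields.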
+
+ @property
+ def user_agent(self) -> str:
+ """Get a randomized user agent for this scraper
+ """
+ return random.choice(USER_AGENT_LIST)
+
+ @property
+ def job_init_kwargs(self) -> Dict[JobField, Any]:
+ """This is a helper property that stores a Dict of JobField : value that
+ we set defaults for when scraping. If the scraper fails to get/set these
+ we can fail back to the empty value from here.
+
+ i.e. JobField.POST_DATE defaults to today.
+ TODO: formalize the defaults for JobFields via Job.__init__(Jobfields...
+ """
+ return {
+ JobField.STATUS: JobStatus.NEW,
+ JobField.LOCALE: self.locale,
+ JobField.QUERY: self.config.search_config.query_string,
+ JobField.DESCRIPTION: '',
+ JobField.URL: '',
+ JobField.SHORT_DESCRIPTION: '',
+ JobField.RAW: None,
+ JobField.PROVIDER: self.__class__.__name__,
+ JobField.REMOTE: '',
+ JobField.WAGE: '',
+ }
+
+ @property
+ def min_required_job_fields(self) -> List[JobField]:
+ """If we dont get() or set() any of these fields, we will raise an
+ exception instead of continuing without that information.
+
+ NOTE: pointless to check for locale / provider / other defaults
+
+ Override if needed, but be aware that key_id should always be populated
+ along with URL or the user can do nothing with the result.
+ """
+ return [
+ JobField.TITLE, JobField.COMPANY, JobField.LOCATION,
+ JobField.KEY_ID, JobField.URL
+ ]
+
+ @property
+ def high_priority_get_set_fields(self) -> List[JobField]:
+ """These get() and/or set() fields will be populated first.
+
+        e.g. we need RAW populated before DESCRIPTION, so RAW should be high.
+        e.g. we need to get key_id before we set job.url, so key_id is high.
+
+ NOTE: override as needed.
+ """
+ return []
+
+ @property
+ @abstractmethod
+ def job_get_fields(self) -> List[JobField]:
+ """Call self.get(...) for the JobFields in this list when scraping a Job.
+
+ NOTE: these will be passed job listing soups, if you have data you need
+ to populate that exists in the Job.RAW (the soup from the listing's own
+ page), you should use job_set_fields.
+ """
+
+ @property
+ @abstractmethod
+ def job_set_fields(self) -> List[JobField]:
+ """Call self.set(...) for the JobFields in this list when scraping a Job
+
+ NOTE: You should generally set the job's own page as soup to RAW first
+ and then populate other fields from this soup, or from each-other here.
+ """
+
+ @property
+ @abstractmethod
+ def delayed_get_set_fields(self) -> List[JobField]:
+ """Delay execution when getting /setting any of these attributes of a
+ job.
+
+ TODO: handle this within an overridden self.session.get()
+ """
+
+ @property
+ @abstractmethod
+ def locale(self) -> Locale:
+ """The localization that this scraper was built for.
+
+ i.e. I am looking for jobs on the Canadian version of Indeed, and I
+ speak english, so I will have this return Locale.CANADA_ENGLISH
+
+ We will use this to put the right filters & scrapers together
+
+ NOTE: it is best to inherit this from BaseClass (btm. of file)
+ """
+
+ @property
+ @abstractmethod
+ def headers(self) -> Dict[str, str]:
+ """The Session headers for this scraper to be used with
+ requests.Session.headers.update()
+ """
+
+ def scrape(self) -> Dict[str, Job]:
+ """Scrape job source into a dict of unique jobs keyed by ID
+
+ Returns:
+            jobs (Dict[str, Job]): dict of Jobs keyed by job.key_id
+ """
+
+ # Get a list of job soups from the initial search results page
+        # These won't contain enough information to do more than initialize Job
+ try:
+ job_soups = self.get_job_soups_from_search_result_listings()
+ except Exception as err:
+ raise ValueError(
+ "Unable to extract jobs from initial search result page:\n\t"
+ f"{str(err)}"
+ )
+ n_soups = len(job_soups)
+ self.logger.info(
+ "Scraped %s job listings from search results pages", n_soups
+ )
+
+ # Init a Manager so we can control delaying
+ # this is assuming every job will incur one delayed session.get()
+ # NOTE pylint issue: https://github.com/PyCQA/pylint/issues/3313
+ delay_lock = self.thread_manager.Lock() # pylint: disable=no-member
+ threads = ThreadPoolExecutor(max_workers=MAX_CPU_WORKERS)
+
+ # Distribute work to N workers such that each worker is building one
+ # Job at a time, getting and setting all required attributes
+ jobs_dict = {} # type: Dict[str, Job]
+ try:
+            # Calculate delays for get/set calls per-job.
+            # NOTE: only get/set calls in self.delayed_get_set_fields will be
+            # delayed, and the delay busy-waits.
+ delays = calculate_delays(n_soups, self.config.delay_config)
+ futures = []
+ for job_soup, delay in zip(job_soups, delays):
+ futures.append(
+ threads.submit(
+ self.scrape_job,
+ job_soup=job_soup,
+ delay=delay,
+ delay_lock=delay_lock,
+ )
+ )
+
+ # For each job-soup object, scrape the soup into a Job (w/o desc.)
+ for future in tqdm(as_completed(futures), total=n_soups):
+ job = future.result()
+ if job:
+ # Handle inter-scraped data duplicates by key.
+ # TODO: move this functionality into duplicates filter
+ if job.key_id in jobs_dict:
+ self.logger.error(
+ "Job %s and %s share duplicate key_id: %s",
+ job.title, jobs_dict[job.key_id].title, job.key_id
+ )
+ else:
+ jobs_dict[job.key_id] = job
+
+ finally:
+ # Cleanup
+ threads.shutdown()
+
+ return jobs_dict
+
+ # pylint: disable=no-member
+ def scrape_job(self, job_soup: BeautifulSoup, delay: float,
+ delay_lock: Optional[Lock] = None) -> Optional[Job]:
+ """Scrapes a search page and get a list of soups that will yield jobs
+ Arguments:
+ job_soup (BeautifulSoup): This is a soup object that your get/set
+ will use to perform the get/set action. It should be specific
+ to this job and not contain other job information.
+ delay (float): how long to delay getting/setting for certain
+ get/set calls while scraping data for this job.
+ delay_lock (Optional[Manager.Lock], optional): semaphore for
+ synchronizing respectful delaying across workers
+
+ NOTE: this will never raise an exception to prevent killing workers,
+ who are building jobs sequentially.
+
+ Returns:
+ Optional[Job]: job object constructed from the soup and localization
+ of class, returns None if scrape failed.
+ """
+ # Scrape the data for the post, requiring a minimum of info...
+ # NOTE: if we perform a self.session.get we may get respectfully delayed
+ job = None # type: Optional[Job]
+        job_init_kwargs = self.job_init_kwargs  # NOTE: cached locally to avoid rebuilding defaults per-field
+ for is_get, field in self._actions_list:
+
+ # Break out immediately because we have failed a filterable
+ # condition with something we initialized while scraping.
+ if job and self.job_filter.filterable(job):
+ if self.job_filter.is_duplicate(job):
+ # NOTE: if we pre-empt scraping duplicates we cannot update
+ # the existing job listing with the new information!
+ # TODO: make this behaviour configurable? ('minimal-get' ?)
+ self.logger.debug(
+ "Scraped job %s has key_id in known duplicates list. "
+ "Continuing scrape of job to update existing job "
+ "attributes.",
+ job.key_id
+ )
+ else:
+ self.logger.debug(
+ "Cancelled scraping of %s, failed JobFilter",
+ job.key_id
+ )
+ break
+
+ # Respectfully delay if it's configured to do so.
+ if field in self.delayed_get_set_fields:
+ if delay_lock:
+ self.logger.debug("Delaying for %.4f", delay)
+ with delay_lock:
+ sleep(delay)
+ else:
+ sleep(delay)
+
+ try:
+ if is_get:
+ job_init_kwargs[field] = self.get(field, job_soup)
+ else:
+ if not job:
+                    # Build the initial Job object, populated with our defaults
+ job = Job(**{
+ k.name.lower(): v for k, v
+ in job_init_kwargs.items()
+ })
+ self.set(field, job, job_soup)
+
+ except Exception as err:
+
+ # TODO: we should really dump the soup object to an XML file
+ # so that users encountering bugs can submit it and we can
+ # quickly fix any failing scraping.
+
+                if field in self.min_required_job_fields:
+                    raise ValueError(
+                        "Unable to scrape minimum-required job field: "
+                        f"{field.name}. Got error: {str(err)}. "
+                        f"{job.url if job else '(job URL not yet scraped)'}"
+                    )
+                else:
+                    # Fail gracefully so we can continue scraping other fields
+                    self.logger.warning(
+                        "Unable to scrape %s for job: %s. %s",
+                        field.name.lower(),
+                        err,
+                        job.url if job else '(job URL not yet scraped)',
+                    )
+
+ # Validate job fields if we got something
+ if job:
+ try:
+ job.validate()
+ except Exception as err:
+ # Bad job scrapes can't take down execution!
+ self.logger.error("Job failed validation: %s", err)
+ return None
+
+ return job
+ # pylint: enable=no-member
+
+ @abstractmethod
+ def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
+ """Scrapes a job provider's response to a search query where we are
+ shown many job listings at once.
+
+ NOTE: the soups list returned by this method should contain enough
+ information to set your self.min_required_job_fields with get()
+
+ Returns:
+ List[BeautifulSoup]: list of jobs soups we can use to make a Job
+ """
+
+ @abstractmethod
+ def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
+ """Get a single job attribute from a soup object by JobField
+
+ i.e. if param is JobField.COMPANY --> scrape from soup --> return str
+ TODO: better way to handle ret type?
+ """
+
+ @abstractmethod
+ def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
+ """Set a single job attribute from a soup object by JobField
+
+ Use this to set Job attribs that rely on Job existing already
+ with the required minimum fields.
+
+        i.e. I can set() Job.RAW to be the soup of its own dedicated web page
+        (Job.URL), and then set() Job.DESCRIPTION from that Job.RAW
+ """
+
+ def _validate_get_set(self) -> None:
+ """Ensure the get/set actions cover all need attribs and dont intersect
+ """
+ set_job_get_fields = set(self.job_get_fields)
+ set_job_set_fields = set(self.job_set_fields)
+ all_set_get_fields = set(self.job_get_fields + self.job_set_fields)
+ set_min_fields = set(self.min_required_job_fields)
+
+ set_missing_req_fields = set_min_fields - all_set_get_fields
+ if set_missing_req_fields:
+ raise ValueError(
+ f"Scraper: {self.__class__.__name__} Job attributes: "
+ f"{set_missing_req_fields} are required and not implemented."
+ )
+
+ field_intersection = set_job_get_fields.intersection(set_job_set_fields)
+ if field_intersection:
+ raise ValueError(
+ f"Scraper: {self.__class__.__name__} Job attributes: "
+ f"{field_intersection} are implemented by both get() and set()!"
+ )
+ excluded_fields = [] # type: List[JobField]
+ for field in JobField:
+ # NOTE: we exclude status, locale, query, provider and scrape date
+ # because these are set without needing any scrape data.
+            # TODO: SHORT_DESCRIPTION and RAW are not implemented yet; remove
+            # this check once they are.
+ if (field not in [JobField.STATUS, JobField.LOCALE, JobField.QUERY,
+ JobField.SCRAPE_DATE, JobField.PROVIDER,
+ JobField.SHORT_DESCRIPTION, JobField.RAW]
+ and field not in self.job_get_fields
+ and field not in self.job_set_fields):
+ excluded_fields.append(field)
+ if excluded_fields:
+            # NOTE: INFO level because this is OK, but ideally users see this
+            # so they understand why some fields may be missing in the CSV
+            # (and are motivated to help implement them).
+ self.logger.info(
+ "No get() or set() will be done for Job attrs: %s",
+ [field.name for field in excluded_fields]
+ )
+
+
+# Just some basic localized scrapers, you can inherit these to set the locale.
+class BaseUSAEngScraper(BaseScraper):
+ """Localized scraper for USA English
+ """
+ @property
+ def locale(self) -> Locale:
+ return Locale.USA_ENGLISH
+
+
+class BaseCANEngScraper(BaseScraper):
+ """Localized scraper for Canada English
+ """
+ @property
+ def locale(self) -> Locale:
+ return Locale.CANADA_ENGLISH
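+
+
+# A minimal sketch of a concrete scraper, kept as a comment for reference.
+# This is illustrative only: 'ExampleScraper', the URL and the CSS classes
+# below are hypothetical and do not correspond to a real provider.
+#
+#   class ExampleScraper(BaseCANEngScraper):
+#
+#       @property
+#       def job_get_fields(self) -> List[JobField]:
+#           return [JobField.KEY_ID, JobField.TITLE, JobField.COMPANY,
+#                   JobField.LOCATION, JobField.URL]
+#
+#       @property
+#       def job_set_fields(self) -> List[JobField]:
+#           return [JobField.RAW, JobField.DESCRIPTION]
+#
+#       @property
+#       def delayed_get_set_fields(self) -> List[JobField]:
+#           return [JobField.RAW]
+#
+#       @property
+#       def headers(self) -> Dict[str, str]:
+#           return {'user-agent': self.user_agent}
+#
+#       def get_job_soups_from_search_result_listings(self):
+#           page = self.session.get('https://example.com/jobs?q=python')
+#           return BeautifulSoup(page.text, self.config.bs4_parser).find_all(
+#               'div', attrs={'class': 'job-card'})
+#
+#       def get(self, parameter, soup):
+#           ...  # return one attribute (TITLE, KEY_ID, ...) from the soup
+#
+#       def set(self, parameter, job, soup):
+#           ...  # e.g. fetch job.url into job._raw_scrape_data for RAW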
diff --git a/jobfunnel/backend/scrapers/glassdoor.py b/jobfunnel/backend/scrapers/glassdoor.py
new file mode 100644
index 00000000..29c03d88
--- /dev/null
+++ b/jobfunnel/backend/scrapers/glassdoor.py
@@ -0,0 +1,352 @@
+"""Scraper for www.glassdoor.X
+FIXME: this is currently unable to get past page 1 of job results.
+"""
+import re
+from abc import abstractmethod
+from concurrent.futures import ThreadPoolExecutor, wait
+from math import ceil
+from typing import Any, Dict, List, Tuple, Union
+
+from bs4 import BeautifulSoup
+from requests import Session
+
+from jobfunnel.backend import Job
+from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper,
+ BaseUSAEngScraper)
+from jobfunnel.backend.tools import get_webdriver
+from jobfunnel.backend.tools.filters import JobFilter
+from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
+from jobfunnel.resources import MAX_CPU_WORKERS, JobField
+
+# pylint: disable=using-constant-test,unused-import
+if False: # or typing.TYPE_CHECKING if python3.5.3+
+ from jobfunnel.config import JobFunnelConfigManager
+# pylint: enable=using-constant-test,unused-import
+
+
+MAX_GLASSDOOR_LOCATIONS_TO_RETURN = 10
+LOCATION_BASE_URL = 'https://www.glassdoor.co.in/findPopularLocationAjax.htm?'
+MAX_RESULTS_PER_GLASSDOOR_PAGE = 30
+GLASSDOOR_RADIUS_MAP = {
+ 0: 0,
+ 10: 6,
+ 20: 12,
+ 30: 19,
+ 50: 31,
+ 100: 62,
+ 200: 124,
+}
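+# NOTE: keys are the user-facing search radii (km or miles depending on the
+# locale) and values are what gets substituted into the glassdoor search
+# URL's radius parameter, e.g. a quantized radius of 50 becomes '&radius=31'.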
+
+
+class BaseGlassDoorScraper(BaseScraper):
+
+ def __init__(self, session: Session, config: 'JobFunnelConfigManager',
+ job_filter: JobFilter) -> None:
+ """Init that contains glassdoor specific stuff
+ """
+ super().__init__(session, config, job_filter)
+ self.max_results_per_page = MAX_RESULTS_PER_GLASSDOOR_PAGE
+ self.query = '-'.join(self.config.search_config.keywords)
+ # self.driver = get_webdriver() TODO: we can use this if-needed
+
+ @abstractmethod
+ def quantize_radius(self, radius: int) -> int:
+ """Get the glassdoor-quantized radius
+ """
+
+ @property
+    def job_get_fields(self) -> List[JobField]:
+ """Call self.get(...) for the JobFields in this list when scraping a Job
+ """
+ return [
+ JobField.TITLE, JobField.COMPANY, JobField.LOCATION,
+ JobField.POST_DATE, JobField.URL, JobField.KEY_ID, JobField.WAGE,
+ ]
+
+ @property
+    def job_set_fields(self) -> List[JobField]:
+ """Call self.set(...) for the JobFields in this list when scraping a Job
+ """
+ return [JobField.RAW, JobField.DESCRIPTION]
+
+ @property
+    def delayed_get_set_fields(self) -> List[JobField]:
+ """Delay execution when getting /setting any of these attributes of a
+ job.
+
+ Override this as needed.
+ """
+ return [JobField.RAW]
+
+ @property
+ def headers(self) -> Dict[str, str]:
+        return {
+ 'accept': 'text/html,application/xhtml+xml,application/xml;'
+ 'q=0.9,image/webp,*/*;q=0.8',
+ 'accept-encoding': 'gzip, deflate, sdch, br',
+ 'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
+ 'referer':
+ f'https://www.glassdoor.{self.config.search_config.domain}/',
+ 'upgrade-insecure-requests': '1',
+ 'user-agent': self.user_agent,
+ 'Cache-Control': 'no-cache',
+ 'Connection': 'keep-alive',
+ }
+
+ def get_search_url(self,
+ method='get') -> Union[str, Tuple[str, Dict[str,str]]]:
+ """Gets the glassdoor search url
+        NOTE: this relies on your city, not the state / province!
+ """
+ # Form the location lookup request data
+ data = {
+ 'term': self.config.search_config.city,
+ 'maxLocationsToReturn': MAX_GLASSDOOR_LOCATIONS_TO_RETURN,
+ }
+
+ # Get the location id for search location
+ location_id = self.session.post(
+ LOCATION_BASE_URL, headers=self.headers, data=data
+ ).json()[0]['locationId']
+
+ if method == 'get':
+
+ # Form job search url
+ search = (
+ 'https://www.glassdoor.{}/Job/jobs.htm?clickSource=searchBtn'
+ '&sc.keyword={}&locT=C&locId={}&jobType=&radius={}'.format(
+ self.config.search_config.domain,
+ self.query,
+ location_id,
+ self.quantize_radius(self.config.search_config.radius),
+ )
+ )
+ return search
+
+ elif method == 'post':
+
+ # Form the job search url
+ search = (
+ f"https://www.glassdoor.{self.config.search_config.domain}"
+ "/Job/jobs.htm"
+ )
+
+ # Form the job search data
+ data = {
+ 'clickSource': 'searchBtn',
+ 'sc.keyword': self.query,
+ 'locT': 'C',
+ 'locId': location_id,
+ 'jobType': '',
+ 'radius':
+ self.quantize_radius(self.config.search_config.radius),
+ }
+
+ return search, data
+ else:
+
+ raise ValueError(f'No html method {method} exists')
+
+ def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
+ """Scrapes raw data from a job source into a list of job-soups
+
+ Returns:
+ List[BeautifulSoup]: list of jobs soups we can use to make Job init
+ """
+ # Get the search url
+ search_url, data = self.get_search_url(method='post')
+
+ # Get the search page result.
+ request_html = self.session.post(search_url, data=data)
+ soup_base = BeautifulSoup(request_html.text, self.config.bs4_parser)
+
+ # Parse total results, and calculate the # of pages needed
+ n_pages = self._get_num_search_result_pages(soup_base)
+ self.logger.info(
+ f"Found {n_pages} pages of search results for query={self.query}"
+ )
+
+ # Get the first page of job soups from the search results listings
+ job_soup_list = self._parse_job_listings_to_bs4(soup_base)
+
+ # Init threads & futures list FIXME: we should probably delay here too
+ threads = ThreadPoolExecutor(MAX_CPU_WORKERS)
+ try:
+ # Search the remaining pages to extract the list of job soups
+ # FIXME: we can't load page 2, it redirects to page 1.
+ # There is toast that shows to get email notifs that shows up if
+ # I click it myself, must be an event listener?
+ futures = []
+ if n_pages > 1:
+ for page in range(2, n_pages + 1):
+ futures.append(
+ threads.submit(
+ self._search_page_for_job_soups,
+ self._get_next_page_url(soup_base, page),
+ job_soup_list,
+ )
+ )
+
+ wait(futures) # wait for all scrape jobs to finish
+ finally:
+ threads.shutdown()
+
+ return job_soup_list
+
+ def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
+ """Get a single job attribute from a soup object by JobField
+ TODO: impl div class=compactStars value somewhere.
+ """
+ if parameter == JobField.TITLE:
+            # TODO: we should instead get the title text the user actually sees
+ return soup.get('data-normalize-job-title')
+ elif parameter == JobField.COMPANY:
+ return soup.find(
+ 'div', attrs={'class', 'jobInfoItem jobEmpolyerName'}
+ ).text.strip()
+ elif parameter == JobField.LOCATION:
+ return soup.get('data-job-loc')
+ # FIXME: impl.
+ # elif parameter == JobField.TAGS:
+ # labels = soup.find_all('div', attrs={'class', 'jobLabel'})
+ # if labels:
+ # return [
+ # l.text.strip() for l in labels if l.text.strip() != 'New'
+ # ]
+ # else:
+ # return []
+ # FIXME: impl JobField.REMOTE
+ elif parameter == JobField.POST_DATE:
+ return calc_post_date_from_relative_str(
+ soup.find(
+ 'div', attrs={
+ 'class': 'd-flex align-items-end pl-std css-mi55ob'
+ }
+ ).text.strip()
+ )
+ elif parameter == JobField.WAGE:
+            # NOTE: most jobs don't have this, so we won't raise a warning
+            # here and will fail silently instead
+ wage = soup.find('span', attrs={'class': 'gray salary'})
+ if wage is not None:
+ return wage.text.strip()
+ elif parameter == JobField.KEY_ID:
+ return soup.get('data-id')
+ elif parameter == JobField.URL:
+ part_url = soup.find(
+ 'div', attrs={'class', 'logoWrap'}
+ ).find('a').get('href')
+ return (
+ f'https://www.glassdoor.{self.config.search_config.domain}'
+ f'{part_url}'
+ )
+ else:
+ raise NotImplementedError(f"Cannot get {parameter.name}")
+
+ def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
+ """Set a single job attribute from a soup object by JobField
+        NOTE: DESCRIPTION requires a GET request and should be respectfully delayed
+ """
+ if parameter == JobField.RAW:
+ job._raw_scrape_data = BeautifulSoup(
+ self.session.get(job.url).text, self.config.bs4_parser
+ )
+ elif parameter == JobField.DESCRIPTION:
+ assert job._raw_scrape_data
+ job.description = job._raw_scrape_data.find(
+ id='JobDescriptionContainer'
+ ).text.strip()
+ else:
+ raise NotImplementedError(f"Cannot set {parameter.name}")
+
+ def _search_page_for_job_soups(self, listings_page_url: str,
+ job_soup_list: List[BeautifulSoup]) -> None:
+ """Get a list of job soups from a glassdoor page, by loading the page.
+ NOTE: this makes GET requests and should be respectfully delayed.
+ """
+ self.logger.debug(f"Scraping listings page {listings_page_url}")
+ job_soup_list.extend(
+ self._parse_job_listings_to_bs4(
+ BeautifulSoup(
+ self.session.get(listings_page_url).text,
+ self.config.bs4_parser,
+ )
+ )
+ )
+
+ def _parse_job_listings_to_bs4(self, page_soup: BeautifulSoup
+ ) -> List[BeautifulSoup]:
+ """Parse a page of job listings HTML text into job soups
+ """
+ return page_soup.find_all('li', attrs={'class', 'jl'})
+
+ def _get_num_search_result_pages(self, soup_base: BeautifulSoup) -> int:
+ # scrape total number of results, and calculate the # pages needed
+ num_res = soup_base.find('p', attrs={'class', 'jobsCount'}).text.strip()
+ num_res = int(re.findall(r'(\d+)', num_res.replace(',', ''))[0])
+ return int(ceil(num_res / self.max_results_per_page))
+
+ def _get_next_page_url(self, soup_base: BeautifulSoup,
+ results_page_number: int) -> str:
+ """Construct the next page of search results from the initial search
+ results page BeautifulSoup.
+ """
+ part_url = soup_base.find(
+ 'li', attrs={'class', 'next'}
+ ).find('a').get('href')
+
+ assert part_url is not None, "Unable to find next page in listing soup!"
+
+ # Uses partial url to construct next page url
+ return re.sub(
+ r'_IP\d+\.',
+ f'_IP{results_page_number}.',
+ f'https://www.glassdoor.{self.config.search_config.domain}'
+ f'{part_url}',
+ )
+
+
+class GlassDoorScraperCANEng(BaseGlassDoorScraper, BaseCANEngScraper):
+
+ def quantize_radius(self, radius: int) -> int:
+ """Get a Canadian raduius (km)
+ FIXME: use numpy.digitize instead
+ """
+ if radius < 10:
+ radius = 0
+ elif 10 <= radius < 20:
+ radius = 10
+ elif 20 <= radius < 30:
+ radius = 20
+ elif 30 <= radius < 50:
+ radius = 30
+ elif 50 <= radius < 100:
+ radius = 50
+ elif 100 <= radius < 200:
+ radius = 100
+ elif radius >= 200:
+ radius = 200
+ return GLASSDOOR_RADIUS_MAP[radius]
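+        # A possible numpy-based replacement for the chain above (untested
+        # sketch addressing the FIXME; bin edges mirror the thresholds):
+        #   import numpy as np
+        #   keys = [0, 10, 20, 30, 50, 100, 200]
+        #   return GLASSDOOR_RADIUS_MAP[
+        #       keys[int(np.digitize(radius, keys[1:]))]]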
+
+
+class GlassDoorScraperUSAEng(BaseGlassDoorScraper, BaseUSAEngScraper):
+
+ def quantize_radius(self, radius: int) -> int:
+ """Get a USA raduius (miles)
+ FIXME: use numpy.digitize instead
+ """
+ if radius < 5:
+ radius = 0
+ elif 5 <= radius < 10:
+ radius = 5
+ elif 10 <= radius < 15:
+ radius = 10
+ elif 15 <= radius < 25:
+ radius = 15
+ elif 25 <= radius < 50:
+ radius = 25
+ elif 50 <= radius < 100:
+ radius = 50
+ elif radius >= 100:
+ radius = 100
+ return GLASSDOOR_RADIUS_MAP[radius]
diff --git a/jobfunnel/backend/scrapers/indeed.py b/jobfunnel/backend/scrapers/indeed.py
new file mode 100644
index 00000000..5f11be34
--- /dev/null
+++ b/jobfunnel/backend/scrapers/indeed.py
@@ -0,0 +1,287 @@
+"""Scraper designed to get jobs from www.indeed.X
+"""
+import re
+from concurrent.futures import ThreadPoolExecutor, wait
+from math import ceil
+from typing import Any, Dict, List, Optional
+
+from bs4 import BeautifulSoup
+from requests import Session
+
+from jobfunnel.backend import Job
+from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper,
+ BaseUSAEngScraper)
+from jobfunnel.backend.tools.filters import JobFilter
+from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
+from jobfunnel.resources import MAX_CPU_WORKERS, JobField
+
+# pylint: disable=using-constant-test,unused-import
+if False: # or typing.TYPE_CHECKING if python3.5.3+
+ from jobfunnel.config import JobFunnelConfigManager
+# pylint: enable=using-constant-test,unused-import
+
+ID_REGEX = re.compile(r'id=\"sj_([a-zA-Z0-9]*)\"')
+MAX_RESULTS_PER_INDEED_PAGE = 50
+
+
+class BaseIndeedScraper(BaseScraper):
+ """Scrapes jobs from www.indeed.X
+ """
+
+ def __init__(self, session: Session, config: 'JobFunnelConfigManager',
+ job_filter: JobFilter) -> None:
+ """Init that contains indeed specific stuff
+ """
+ super().__init__(session, config, job_filter)
+ self.max_results_per_page = MAX_RESULTS_PER_INDEED_PAGE
+ self.query = '+'.join(self.config.search_config.keywords)
+
+ @property
+    def job_get_fields(self) -> List[JobField]:
+ """Call self.get(...) for the JobFields in this list when scraping a Job
+
+ Override this as needed.
+ """
+ return [
+ JobField.TITLE, JobField.COMPANY, JobField.LOCATION,
+ JobField.KEY_ID, JobField.TAGS, JobField.POST_DATE,
+ # JobField.WAGE, JobField.REMOTE
+ # TODO: wage and remote are available in listings sometimes
+ ]
+
+ @property
+    def job_set_fields(self) -> List[JobField]:
+ """Call self.set(...) for the JobFields in this list when scraping a Job
+
+ NOTE: Since this passes the Job we are updating, the order of this list
+ matters if set fields rely on each-other.
+
+ Override this as needed.
+ """
+ return [JobField.RAW, JobField.URL, JobField.DESCRIPTION]
+
+ @property
+    def delayed_get_set_fields(self) -> List[JobField]:
+ """Delay execution when getting /setting any of these attributes of a
+ job.
+
+ Override this as needed.
+ """
+ return [JobField.RAW]
+
+ @property
+ def high_priority_get_set_fields(self) -> List[JobField]:
+ """These get() and/or set() fields will be populated first.
+ """
+ return [JobField.URL]
+
+ @property
+ def headers(self) -> Dict[str, str]:
+ """Session header for indeed.X
+ """
+ return {
+ 'accept': 'text/html,application/xhtml+xml,application/xml;'
+ 'q=0.9,image/webp,*/*;q=0.8',
+ 'accept-encoding': 'gzip, deflate, sdch, br',
+ 'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
+ 'referer':
+ f'https://www.indeed.{self.config.search_config.domain}/',
+ 'upgrade-insecure-requests': '1',
+ 'user-agent': self.user_agent,
+ 'Cache-Control': 'no-cache',
+ 'Connection': 'keep-alive'
+ }
+
+ def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
+ """Scrapes raw data from a job source into a list of job-soups
+
+ Returns:
+ List[BeautifulSoup]: list of jobs soups we can use to make Job init
+ """
+ # Get the search url
+ search_url = self._get_search_url()
+
+ # Parse total results, and calculate the # of pages needed
+ pages = self._get_num_search_result_pages(search_url)
+ self.logger.info(
+ "Found %d pages of search results for query=%s", pages, self.query
+ )
+
+ # Init list of job soups
+ job_soup_list = [] # type: List[Any]
+
+ # Init threads & futures list FIXME: we should probably delay here too
+ threads = ThreadPoolExecutor(max_workers=MAX_CPU_WORKERS)
+ try:
+ # Scrape soups for all the result pages containing many job listings
+ futures = []
+ for page in range(0, pages):
+ futures.append(
+ threads.submit(
+ self._get_job_soups_from_search_page, search_url, page,
+ job_soup_list
+ )
+ )
+
+ # Wait for all scrape jobs to finish
+ wait(futures)
+
+ finally:
+ threads.shutdown()
+
+ return job_soup_list
+
+ def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
+ """Get a single job attribute from a soup object by JobField
+ """
+ if parameter == JobField.TITLE:
+ return soup.find(
+ 'a', attrs={'data-tn-element': 'jobTitle'}
+ ).text.strip()
+ elif parameter == JobField.COMPANY:
+ return soup.find('span', attrs={'class': 'company'}).text.strip()
+ elif parameter == JobField.LOCATION:
+ return soup.find('span', attrs={'class': 'location'}).text.strip()
+ elif parameter == JobField.TAGS:
+ # tags may not be on page and that's ok.
+ table_soup = soup.find(
+ 'table', attrs={'class': 'jobCardShelfContainer'}
+ )
+ if table_soup:
+ return [
+ td.text.strip() for td in table_soup.find_all(
+ 'td', attrs={'class': 'jobCardShelfItem'}
+ )
+ ]
+ # elif parameter == JobField.REMOTE:
+ # TODO: Impl, this is available in listings as: ...
+ # elif parameter == JobField.WAGE:
+ # TODO: Impl, this is available as: ...
+ elif parameter == JobField.POST_DATE:
+ return calc_post_date_from_relative_str(
+ soup.find('span', attrs={'class': 'date'}).text.strip()
+ )
+ elif parameter == JobField.KEY_ID:
+ return ID_REGEX.findall(
+ str(
+ soup.find(
+ 'a', attrs={'class': 'sl resultLink save-job-link'}
+ )
+ )
+ )[0]
+ else:
+ raise NotImplementedError(f"Cannot get {parameter.name}")
+
+ def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
+ """Set a single job attribute from a soup object by JobField
+ NOTE: URL is high-priority, since we need it to get RAW.
+ """
+ if parameter == JobField.RAW:
+ job._raw_scrape_data = BeautifulSoup(
+ self.session.get(job.url).text, self.config.bs4_parser
+ )
+ elif parameter == JobField.DESCRIPTION:
+ assert job._raw_scrape_data
+ job.description = job._raw_scrape_data.find(
+ id='jobDescriptionText'
+ ).text.strip()
+ elif parameter == JobField.URL:
+ assert job.key_id
+ job.url = (
+ f"http://www.indeed.{self.config.search_config.domain}/"
+ f"viewjob?jk={job.key_id}"
+ )
+ else:
+ raise NotImplementedError(f"Cannot set {parameter.name}")
+
+ def _get_search_url(self, method: Optional[str] = 'get') -> str:
+ """Get the indeed search url from SearchTerms
+ TODO: use Enum for method instead of str.
+ """
+ if method == 'get':
+ # TODO: impl. &remotejob=.... string which allows for remote search
+ # i.e &remotejob=032b3046-06a3-4876-8dfd-474eb5e7ed11
+ return (
+ "https://www.indeed.{0}/jobs?q={1}&l={2}%2C+{3}&radius={4}&"
+ "limit={5}&filter={6}".format(
+ self.config.search_config.domain,
+ self.query,
+ self.config.search_config.city.replace(' ', '+',),
+ self.config.search_config.province_or_state.upper(),
+ self._quantize_radius(self.config.search_config.radius),
+ self.max_results_per_page,
+ int(self.config.search_config.return_similar_results)
+ )
+ )
+ elif method == 'post':
+ # TODO: implement post style for indeed.X
+ raise NotImplementedError()
+ else:
+ raise ValueError(f'No html method {method} exists')
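+        # Example of the resulting 'get' URL (illustrative values only;
+        # assumes return_similar_results is False): keywords ['python', 'dev'],
+        # city 'Waterloo', province_or_state 'on', radius 25, domain 'ca' ->
+        #   https://www.indeed.ca/jobs?q=python+dev&l=Waterloo%2C+ON
+        #       &radius=25&limit=50&filter=0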
+
+ def _quantize_radius(self, radius: int) -> int:
+ """Quantizes the user input radius to a valid radius value into:
+ 5, 10, 15, 25, 50, 100, and 200 kilometers or miles.
+ TODO: implement with numpy instead of if/else cases.
+ """
+ if radius < 5:
+ radius = 0
+ elif 5 <= radius < 10:
+ radius = 5
+ elif 10 <= radius < 15:
+ radius = 10
+ elif 15 <= radius < 25:
+ radius = 15
+ elif 25 <= radius < 50:
+ radius = 25
+ elif 50 <= radius < 100:
+ radius = 50
+ elif radius >= 100:
+ radius = 100
+ return radius
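+        # A possible numpy-based replacement for the TODO above (untested
+        # sketch; the steps mirror the if/else thresholds):
+        #   import numpy as np
+        #   steps = [0, 5, 10, 15, 25, 50, 100]
+        #   return steps[int(np.digitize(radius, steps[1:]))]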
+
+ def _get_job_soups_from_search_page(self, search: str, page: str,
+ job_soup_list: List[BeautifulSoup]
+ ) -> None:
+ """Scrapes the indeed page for a list of job soups
+ NOTE: modifies the job_soup_list in-place
+ """
+ url = f'{search}&start={int(page * self.max_results_per_page)}'
+ job_soup_list.extend(
+ BeautifulSoup(
+ self.session.get(url).text, self.config.bs4_parser
+ ).find_all('div', attrs={'data-tn-component': 'organicJob'})
+ )
+
+ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int:
+ """Calculates the number of pages of job listings to be scraped.
+
+ i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs
+
+ Args:
+ max_pages: the maximum number of pages to be scraped.
+ Returns:
+ The number of pages to be scraped.
+ """
+ # Get the html data, initialize bs4 with lxml
+ request_html = self.session.get(search_url)
+ query_resp = BeautifulSoup(request_html.text, self.config.bs4_parser)
+ num_res = query_resp.find(id='searchCountPages').contents[0].strip()
+ num_res = int(re.findall(r'f (\d+) ', num_res.replace(',', ''))[0])
+ number_of_pages = int(ceil(num_res / self.max_results_per_page))
+ if max_pages == 0:
+ return number_of_pages
+ elif number_of_pages < max_pages:
+ return number_of_pages
+ else:
+ return max_pages
+
+
+class IndeedScraperCANEng(BaseIndeedScraper, BaseCANEngScraper):
+ """Scrapes jobs from www.indeed.ca
+ """
+
+
+class IndeedScraperUSAEng(BaseIndeedScraper, BaseUSAEngScraper):
+ """Scrapes jobs from www.indeed.com
+ """
diff --git a/jobfunnel/backend/scrapers/monster.py b/jobfunnel/backend/scrapers/monster.py
new file mode 100644
index 00000000..f3680687
--- /dev/null
+++ b/jobfunnel/backend/scrapers/monster.py
@@ -0,0 +1,312 @@
+"""Scrapers for www.monster.X
+"""
+import re
+from abc import abstractmethod
+from math import ceil
+from typing import Any, Dict, List, Optional
+
+from bs4 import BeautifulSoup
+from requests import Session
+
+from jobfunnel.backend import Job
+from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper,
+ BaseUSAEngScraper)
+from jobfunnel.backend.tools.filters import JobFilter
+from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
+from jobfunnel.resources import JobField
+
+# pylint: disable=using-constant-test,unused-import
+if False: # or typing.TYPE_CHECKING if python3.5.3+
+ from jobfunnel.config import JobFunnelConfigManager
+# pylint: enable=using-constant-test,unused-import
+
+
+MAX_RESULTS_PER_MONSTER_PAGE = 25
+MONSTER_SIDEPANEL_TAG_ENTRIES = ['industries', 'job type'] # these --> Job.tags
+ID_REGEX = re.compile(
+ r'/((?:[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]'
+ r'{12})|\d+)'
+)
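+# NOTE: this appears intended to capture the job id from the trailing path
+# segment of a monster job URL, e.g. (hypothetical urls):
+#   .../some-job-title/11111111-2222-3333-aaaa-bbbbccccdddd -> the UUID
+#   .../some-job-title/218923432 -> the numeric id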
+
+
+class BaseMonsterScraper(BaseScraper):
+ """Scraper for www.monster.X
+
+    NOTE: I don't think it's possible to scrape REMOTE other than from desc.
+ as of aug 2020. -pm
+ """
+
+ def __init__(self, session: Session, config: 'JobFunnelConfigManager',
+ job_filter: JobFilter) -> None:
+ """Init that contains monster specific stuff
+ """
+ super().__init__(session, config, job_filter)
+ self.query = '-'.join(
+ self.config.search_config.keywords
+ ).replace(' ', '-')
+
+ @property
+    def job_get_fields(self) -> List[JobField]:
+ """Call self.get(...) for the JobFields in this list when scraping a Job
+ """
+ return [
+ JobField.KEY_ID, JobField.TITLE, JobField.COMPANY,
+ JobField.LOCATION, JobField.POST_DATE, JobField.URL,
+ ]
+
+ @property
+    def job_set_fields(self) -> List[JobField]:
+ """Call self.set(...) for the JobFields in this list when scraping a Job
+ """
+ return [JobField.RAW, JobField.DESCRIPTION, JobField.TAGS]
+
+ @property
+ def high_priority_get_set_fields(self) -> List[JobField]:
+ """We need to populate these fields first
+ """
+ return [JobField.RAW, JobField.KEY_ID]
+
+ @property
+    def delayed_get_set_fields(self) -> List[JobField]:
+ """Delay execution when getting /setting any of these attributes of a
+ job.
+
+ Override this as needed.
+ """
+ return [JobField.RAW]
+
+ @property
+ def headers(self) -> Dict[str, str]:
+ """Session header for monster.X
+ """
+ return {
+ 'accept': 'text/html,application/xhtml+xml,application/xml;'
+ 'q=0.9,image/webp,*/*;q=0.8',
+ 'accept-encoding': 'gzip, deflate, sdch, br',
+ 'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
+ 'referer':
+ f'https://www.monster.{self.config.search_config.domain}/',
+ 'upgrade-insecure-requests': '1',
+ 'user-agent': self.user_agent,
+ 'Cache-Control': 'no-cache',
+ 'Connection': 'keep-alive'
+ }
+
+ def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
+ """Get a single job attribute from a soup object by JobField
+        NOTE: all of these get() fields share the same priority.
+ """
+ if parameter == JobField.KEY_ID:
+ # TODO: is there a way to combine these calls?
+ # NOTE: do not use 'data-m_impr_j_jobid' as this is duplicated
+ return soup.find('h2', attrs={'class': 'title'}).find('a').get(
+ 'data-m_impr_j_postingid'
+ )
+ elif parameter == JobField.TITLE:
+ return soup.find('h2', attrs={'class': 'title'}).text.strip()
+ elif parameter == JobField.COMPANY:
+ return soup.find('div', attrs={'class': 'company'}).text.strip()
+ elif parameter == JobField.LOCATION:
+ return soup.find('div', attrs={'class': 'location'}).text.strip()
+ elif parameter == JobField.POST_DATE:
+ return calc_post_date_from_relative_str(
+ soup.find('time').text.strip()
+ )
+ elif parameter == JobField.URL:
+            # NOTE: these links can be hard to view directly; we sometimes get 503s
+ return str(
+ soup.find('a', attrs={'data-bypass': 'true'}).get('href')
+ )
+ else:
+ raise NotImplementedError(f"Cannot get {parameter.name}")
+
+ def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
+ """Set a single job attribute from a soup object by JobField
+ NOTE: priority is: HIGH: RAW, LOW: DESCRIPTION / TAGS
+ """
+ if parameter == JobField.RAW:
+ job._raw_scrape_data = BeautifulSoup(
+ self.session.get(job.url).text, self.config.bs4_parser
+ )
+ elif parameter == JobField.DESCRIPTION:
+ assert job._raw_scrape_data
+ job.description = job._raw_scrape_data.find(
+ id='JobDescription'
+ ).text.strip()
+ elif parameter == JobField.TAGS:
+            # NOTE: this seems a bit flimsy; monster allows a lot of flexibility here
+ assert job._raw_scrape_data
+ tags = [] # type: List[str]
+ for li in job._raw_scrape_data.find_all(
+ 'section', attrs={'class': 'summary-section'}):
+ table_key = li.find('dt')
+ if (table_key and table_key.text.strip().lower()
+ in MONSTER_SIDEPANEL_TAG_ENTRIES):
+ table_value = li.find('dd')
+ if table_value:
+ tags.append(table_value.text.strip())
+ else:
+ raise NotImplementedError(f"Cannot set {parameter.name}")
+
+ def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
+ """Scrapes raw data from a job source into a list of job-soups
+
+ TODO: use threading here too
+
+ Returns:
+ List[BeautifulSoup]: list of jobs soups we can use to make Job init
+ """
+ # Get the search url
+ search_url = self._get_search_url()
+
+ # Load our initial search results listings page
+ initial_search_results_html = self.session.get(search_url)
+ initial_search_results_soup = BeautifulSoup(
+ initial_search_results_html.text, self.config.bs4_parser
+ )
+
+ # Parse total results, and calculate the # of pages needed
+ n_pages = self._get_num_search_result_pages(initial_search_results_soup)
+ self.logger.info(
+ "Found %d pages of search results for query=%s", n_pages, self.query
+ )
+
+ # Get first page of listing soups from our search results listings page
+        # NOTE: Monster is an endless-scroll style of job site, so each page
+        # includes prior results; we de-duplicate previous pages as we go.
+ # TODO: better error handling here?
+ # TODO: maybe we can move this into get set / BaseScraper somehow?
+ def __get_job_soups_by_key_id(result_listings: BeautifulSoup
+ ) -> Dict[str, BeautifulSoup]:
+ return {
+ self.get(JobField.KEY_ID, job_soup): job_soup
+ for job_soup in self._get_job_soups_from_search_page(
+ result_listings
+ )
+ }
+
+ job_soups_dict = __get_job_soups_by_key_id(initial_search_results_soup)
+
+ # Get all the other pages
+ if n_pages > 1:
+            for page in range(2, n_pages + 1):
+ next_listings_page_soup = BeautifulSoup(
+ self.session.get(self._get_search_url(page=page)).text,
+ self.config.bs4_parser,
+ )
+ # Add only the jobs that we didn't 'scroll' past already
+ job_soups_dict.update(
+ __get_job_soups_by_key_id(next_listings_page_soup)
+ )
+
+ # TODO: would be cool if we could avoid key_id scrape duplication in get
+ return list(job_soups_dict.values())
+
+ def _get_job_soups_from_search_page(self,
+ initial_results_soup: BeautifulSoup,
+ ) -> List[BeautifulSoup]:
+ """Get individual job listing soups from a results page of many jobs
+ """
+ return initial_results_soup.find_all('div', attrs={'class': 'flex-row'})
+
+ def _get_num_search_result_pages(self, initial_results_soup: BeautifulSoup,
+ ) -> int:
+ """Calculates the number of pages of job listings to be scraped.
+
+        i.e. your search yields 230 results at 25 res/page -> 10 pages of jobs
+
+ Args:
+ initial_results_soup: the soup for the first search results page
+ Returns:
+ The number of pages of job listings to be scraped.
+ """
+ # scrape total number of results, and calculate the # pages needed
+ partial = initial_results_soup.find('h2', 'figure').text.strip()
+ assert partial, "Unable to identify number of search results"
+ num_res = int(re.findall(r'(\d+)', partial)[0])
+ return int(ceil(num_res / MAX_RESULTS_PER_MONSTER_PAGE))
+
+ def _get_search_url(self, method: Optional[str] = 'get',
+ page: int = 1) -> str:
+ """Get the monster search url from SearchTerms
+ TODO: implement fulltime/part-time portion + company search?
+ TODO: implement POST
+        NOTE: unfortunately we cannot start on any page other than 1,
+        so the job listings just scroll forever and we will see
+        all previous jobs as we go.
+ """
+ if method == 'get':
+ return (
+ 'https://www.monster.{}/jobs/search/?{}q={}&where={}__2C-{}'
+ '&rad={}'.format(
+ self.config.search_config.domain,
+ f'page={page}&' if page > 1 else '',
+ self.query,
+ self.config.search_config.city.replace(' ', '-'),
+ self.config.search_config.province_or_state,
+ self._convert_radius(self.config.search_config.radius)
+ )
+ )
+ elif method == 'post':
+ raise NotImplementedError()
+ else:
+ raise ValueError(f'No html method {method} exists')
+
+ @abstractmethod
+ def _convert_radius(self, radius: int) -> int:
+ """NOTE: radius conversion is units/locale specific
+ """
+
+class MonsterScraperCANEng(BaseMonsterScraper, BaseCANEngScraper):
+ """Scrapes jobs from www.monster.ca
+ """
+ def _convert_radius(self, radius: int) -> int:
+ """convert radius in miles TODO replace with numpy
+ """
+ if radius < 5:
+ radius = 0
+ elif 5 <= radius < 10:
+ radius = 5
+ elif 10 <= radius < 20:
+ radius = 10
+ elif 20 <= radius < 50:
+ radius = 20
+ elif 50 <= radius < 100:
+ radius = 50
+ elif radius >= 100:
+ radius = 100
+ return radius
+
+
+class MonsterScraperUSAEng(BaseMonsterScraper, BaseUSAEngScraper):
+ """Scrapes jobs from www.monster.com
+ """
+
+ def _convert_radius(self, radius: int) -> int:
+ """convert radius in miles TODO replace with numpy
+ """
+ if radius < 5:
+ radius = 0
+ elif 5 <= radius < 10:
+ radius = 5
+ elif 10 <= radius < 20:
+ radius = 10
+ elif 20 <= radius < 30:
+ radius = 20
+ elif 30 <= radius < 40:
+ radius = 30
+ elif 40 <= radius < 50:
+ radius = 40
+ elif 50 <= radius < 60:
+ radius = 50
+ elif 60 <= radius < 75:
+ radius = 60
+ elif 75 <= radius < 100:
+ radius = 75
+ elif 100 <= radius < 150:
+ radius = 100
+ elif 150 <= radius < 200:
+ radius = 150
+ elif radius >= 200:
+ radius = 200
+ return radius
diff --git a/jobfunnel/backend/scrapers/registry.py b/jobfunnel/backend/scrapers/registry.py
new file mode 100644
index 00000000..12ef742d
--- /dev/null
+++ b/jobfunnel/backend/scrapers/registry.py
@@ -0,0 +1,32 @@
+"""Lookup tables where we can map scrapers to locales, etc
+
+NOTE: if you implement a scraper you must add it here
+TODO: there must be a better way to do this by using class attrib of Provider
+"""
+from jobfunnel.resources import Locale, Provider
+
+from jobfunnel.backend.scrapers.indeed import (
+ IndeedScraperCANEng, IndeedScraperUSAEng,
+)
+from jobfunnel.backend.scrapers.monster import (
+ MonsterScraperCANEng, MonsterScraperUSAEng,
+)
+from jobfunnel.backend.scrapers.glassdoor import (
+ GlassDoorScraperCANEng, GlassDoorScraperUSAEng,
+)
+
+SCRAPER_FROM_LOCALE = {
+    # Maps each Provider to its scraper class for each supported Locale
+ Provider.INDEED: {
+ Locale.CANADA_ENGLISH: IndeedScraperCANEng,
+ Locale.USA_ENGLISH: IndeedScraperUSAEng,
+ },
+ Provider.GLASSDOOR: {
+ Locale.CANADA_ENGLISH: GlassDoorScraperCANEng,
+ Locale.USA_ENGLISH: GlassDoorScraperUSAEng,
+ },
+ Provider.MONSTER: {
+ Locale.CANADA_ENGLISH: MonsterScraperCANEng,
+ Locale.USA_ENGLISH: MonsterScraperUSAEng,
+ },
+}
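+
+# Example lookup (illustrative):
+#   scraper_class = SCRAPER_FROM_LOCALE[Provider.INDEED][Locale.CANADA_ENGLISH]
+#   # -> IndeedScraperCANEng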
diff --git a/jobfunnel/backend/tools/__init__.py b/jobfunnel/backend/tools/__init__.py
new file mode 100644
index 00000000..13c46753
--- /dev/null
+++ b/jobfunnel/backend/tools/__init__.py
@@ -0,0 +1,2 @@
+from jobfunnel.backend.tools.tools import get_webdriver, get_logger, Logger
+# NOTE: we can't import the delay module here or we'd cause a circular import.
diff --git a/jobfunnel/backend/tools/delay.py b/jobfunnel/backend/tools/delay.py
new file mode 100644
index 00000000..2ee45cbb
--- /dev/null
+++ b/jobfunnel/backend/tools/delay.py
@@ -0,0 +1,116 @@
+"""Module for calculating random or non-random delay
+"""
+from math import ceil, log, sqrt
+from random import uniform
+from typing import List, Union
+
+from numpy import arange
+from scipy.special import expit # pylint: disable=no-name-in-module
+
+from jobfunnel.config import DelayConfig
+from jobfunnel.resources import DelayAlgorithm
+
+
+def _c_delay(list_len: int, delay: Union[int, float]):
+    """Build a constant delay list with a short ramp-up over the first items.
+    """
+    delays = [delay] * list_len
+    # Ramp the first 8 elements up towards the constant delay
+    inc = .2  # default increment
+    offset = len(delays[0:8]) / 5  # default offset
+    if delay < 1.5:
+        # shrink increment and offset to prevent zeros and negative values
+        inc = delay / 8
+        offset = float(len(delays[0:8])) * inc
+    # Apply the incrementing ramp to the first 8 elements
+    delays[0:8] = [(x - offset) + i * inc for i, x in enumerate(delays[0:8])]
+    return delays
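+    # Worked example (illustrative): _c_delay(10, 2.0) returns
+    #   [0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.0]
+    # i.e. a short ramp over the first 8 entries up to the constant delay.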
+
+
+def _lin_delay(list_len: int, delay: Union[int, float]):
+ """Calculates y=.2*x and sets y=delay at intersection of x between lines.
+ """
+ # calculates x value where lines intersect
+ its = 5 * delay # its = intersection
+    # any max delay of .2 or less falls back to the constant delay
+ if its <= 1:
+ return _c_delay(list_len, delay)
+ else:
+ # prevents slicing from breaking if delay is a float
+ if isinstance(its, float):
+ its = int(ceil(its))
+ # create list of x values based on scrape list size
+ delays = [*range(list_len)]
+ delays[0:its] = [x / 5 for x in delays[0:its]]
+ delays[its:] = [delay] * (len(delays) - its)
+ return delays
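+    # Worked example (illustrative): _lin_delay(6, 1.0) returns
+    #   [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
+    # i.e. a 0.2 * x ramp that caps out at the configured max delay.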
+
+
+def _sig_delay(list_len: int, delay: Union[int, float]):
+ """Calculates Richards/Sigmoid curve for delay.
+ NOTE: https://en.wikipedia.org/wiki/Generalised_logistic_function
+ """
+ gr = sqrt(delay) * 4 # growth rate
+ y_0 = log(4 * delay) # Y(0)
+ # calculates sigmoid curve using vars rewritten to be our x
+ delays = delay * expit(arange(list_len) / gr - y_0)
+ return delays.tolist() # convert np array back to list
+
+
+def calculate_delays(list_len: int, delay_config: DelayConfig) -> List[float]:
+ """Checks delay config and returns calculated delay list.
+
+ NOTE: we do this to be respectful to online job sources
+ TODO: we should be able to calculate delays on-demand.
+
+ Args:
+ list_len: length of scrape job list
+        delay_config: DelayConfig object containing the delay settings
+
+ Returns:
+ list of delay time matching length of scrape job list
+ """
+ delay_config.validate()
+
+ # Delay calculations using specified equations
+ if delay_config.algorithm == DelayAlgorithm.CONSTANT:
+ delay_vals = _c_delay(list_len, delay_config.max_duration)
+ elif delay_config.algorithm == DelayAlgorithm.LINEAR:
+ delay_vals = _lin_delay(list_len, delay_config.max_duration)
+ elif delay_config.algorithm == DelayAlgorithm.SIGMOID:
+ delay_vals = _sig_delay(list_len, delay_config.max_duration)
+ else:
+ raise ValueError(f"Cannot calculate delay for {delay_config.algorithm}")
+
+    # Enforce the minimum delay: raise any values below min_duration up to it
+    if delay_config.min_duration > 0:
+        for i, n in enumerate(delay_vals):
+            if n > delay_config.min_duration:
+                break
+            delay_vals[i] = delay_config.min_duration
+
+    # Build the final list of delays, rounded to 3 decimal places
+    if delay_config.random:  # random delay was requested
+        if delay_config.converge:  # converge towards the max duration
+            # lower bound = calculated delay, upper bound = max duration
+            durations = [
+                round(uniform(x, delay_config.max_duration), 3)
+                for x in delay_vals
+            ]
+        else:
+            # lower bound = min duration, upper bound = calculated delay
+            durations = [
+                round(uniform(delay_config.min_duration, x), 3)
+                for x in delay_vals
+            ]
+
+ else:
+ durations = [round(i, 3) for i in delay_vals]
+
+ # Always set first element to 0 so scrape starts right away
+ durations[0] = 0.0
+
+ return durations
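+
+
+# Worked example (illustrative, assuming a DelayConfig whose attributes are
+# algorithm=DelayAlgorithm.LINEAR, max_duration=1.0, min_duration=0.3 and
+# random=False): calculate_delays(6, config) returns
+#   [0.0, 0.3, 0.4, 0.6, 0.8, 1.0]
+# The linear ramp is clamped below by min_duration, and the first entry is
+# always forced to 0.0 so scraping starts immediately.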
diff --git a/jobfunnel/backend/tools/filters.py b/jobfunnel/backend/tools/filters.py
new file mode 100644
index 00000000..b6d25437
--- /dev/null
+++ b/jobfunnel/backend/tools/filters.py
@@ -0,0 +1,358 @@
+"""Filters that are used in jobfunnel's filter() method or as intermediate
+filters to reduce unnecessary scraping
+Paul McInnis 2020
+"""
+import logging
+from collections import namedtuple
+from copy import deepcopy
+from datetime import datetime
+from typing import Dict, List, Optional, Tuple
+
+import nltk
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+from jobfunnel.backend import Job
+from jobfunnel.backend.tools import Logger
+from jobfunnel.resources import (DEFAULT_MAX_TFIDF_SIMILARITY,
+ MIN_JOBS_TO_PERFORM_SIMILARITY_SEARCH,
+ DuplicateType)
+
+DuplicatedJob = namedtuple(
+ 'DuplicatedJob', ['original', 'duplicate', 'type'],
+)
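+# e.g. (illustrative) DuplicatedJob(original=existing_job, duplicate=new_job,
+# type=DuplicateType.KEY_ID) records that new_job shares its key_id with
+# existing_job.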
+
+
+class JobFilter(Logger):
+ """Class Used by JobFunnel and BaseScraper to filter collections of jobs
+
+ TODO: make more configurable, maybe with a FilterBank class.
+ """
+
+ def __init__(self, user_block_jobs_dict: Optional[Dict[str, str]] = None,
+ duplicate_jobs_dict: Optional[Dict[str, str]] = None,
+ blocked_company_names_list: Optional[List[str]] = None,
+ max_job_date: Optional[datetime] = None,
+ max_similarity: float = DEFAULT_MAX_TFIDF_SIMILARITY,
+ min_tfidf_corpus_size:
+ int = MIN_JOBS_TO_PERFORM_SIMILARITY_SEARCH,
+ log_level: int = logging.INFO,
+ log_file: str = None) -> None:
+ """Init
+
+ TODO: need a config for this
+
+ Args:
+ user_block_jobs_dict (Optional[Dict[str, str]], optional): dict
+ containing user's blocked jobs. Defaults to None.
+ duplicate_jobs_dict (Optional[Dict[str, str]], optional): dict
+ containing duplicate jobs, detected by content. Defaults to None
+ blocked_company_names_list (Optional[List[str]], optional): list of
+ company names disallowed from results. Defaults to None.
+            max_job_date (Optional[datetime], optional): maximum date that a
+                job can be scraped. Defaults to None.
+            log_level (Optional[int], optional): log level. Defaults to INFO.
+            log_file (Optional[str], optional): log file. Defaults to None.
+ """
+ super().__init__(
+ level=log_level,
+ file_path=log_file,
+ )
+ self.user_block_jobs_dict = user_block_jobs_dict or {}
+ self.duplicate_jobs_dict = duplicate_jobs_dict or {}
+ self.blocked_company_names_list = blocked_company_names_list or []
+ self.max_job_date = max_job_date
+ self.max_similarity = max_similarity
+ self.min_tfidf_corpus_size = min_tfidf_corpus_size
+
+ # Retrieve stopwords if not already downloaded
+ try:
+ stopwords = nltk.corpus.stopwords.words('english')
+ except LookupError:
+ nltk.download('stopwords', quiet=True)
+ stopwords = nltk.corpus.stopwords.words('english')
+
+ # Init vectorizer
+ self.vectorizer = TfidfVectorizer(
+ strip_accents='unicode',
+ lowercase=True,
+ analyzer='word',
+ stop_words=stopwords,
+ )
+
+ def filter(self, jobs_dict: Dict[str, Job],
+ remove_existing_duplicate_keys: bool = True) -> Dict[str, Job]:
+ """Filter jobs that fail numerous tests, possibly including duplication
+
+ Arguments:
+ remove_existing_duplicate_keys: pass True to remove jobs if their
+ ID was previously detected to be a duplicate via TFIDF cosine
+ similarity
+
+        NOTE: if you remove duplicates before processing them into updates
+ you will retain potentially stale job information.
+
+ Returns:
+ jobs_dict with all filtered items removed.
+ """
+ return {
+ key_id: job for key_id, job in jobs_dict.items()
+ if not self.filterable(
+ job, check_existing_duplicates=remove_existing_duplicate_keys
+ )
+ }
+
+ def filterable(self, job: Job,
+ check_existing_duplicates: bool = True) -> bool:
+ """Filter jobs out using all our available filters
+
+ NOTE: this allows job to be partially initialized
+
+ Arguments:
+ check_existing_duplicates: pass True to check if ID was previously
+ detected to be a duplicate via TFIDF cosine similarity
+
+ Returns:
+ True if the job should be removed from incoming data, else False
+ """
+ return bool(
+ job.status and job.is_remove_status
+ or (job.company in self.blocked_company_names_list)
+ or (job.post_date and self.max_job_date
+ and job.is_old(self.max_job_date))
+ or (job.key_id and self.user_block_jobs_dict
+ and job.key_id in self.user_block_jobs_dict)
+ or (check_existing_duplicates and self.is_duplicate(job))
+ )
+
+ def is_duplicate(self, job: Job) -> bool:
+ """Return true if passed Job has key_id and it is in our duplicates list
+ """
+ return bool(job.key_id and self.duplicate_jobs_dict
+ and job.key_id in self.duplicate_jobs_dict)
+
+ def find_duplicates(self, existing_jobs_dict: Dict[str, Job],
+ incoming_jobs_dict: Dict[str, Job],
+ ) -> List[DuplicatedJob]:
+ """Remove all known duplicates from jobs_dict and update original data
+
+ TODO: find duplicates by content within existing jobs
+
+ Args:
+ existing_jobs_dict (Dict[str, Job]): dict of jobs keyed by key_id.
+ incoming_jobs_dict (Dict[str, Job]): dict of new jobs by key_id.
+
+ Returns:
+            List[DuplicatedJob]: list of detected duplicates, each paired with
+                its original job (where known) and the duplication type.
+ """
+ duplicate_jobs_list = [] # type: List[DuplicatedJob]
+ filt_existing_jobs_dict = deepcopy(existing_jobs_dict)
+ filt_incoming_jobs_dict = {} # type: Dict[str, Job]
+
+ # Look for matches by key id only
+ for key_id, incoming_job in incoming_jobs_dict.items():
+
+ # The key-ids are a direct match between existing and new
+ if key_id in existing_jobs_dict:
+ self.logger.debug(
+ f"Identified duplicate {key_id} between incoming data "
+ "and existing data."
+ )
+ duplicate_jobs_list.append(
+ DuplicatedJob(
+ original=existing_jobs_dict[key_id],
+ duplicate=incoming_job,
+ type=DuplicateType.KEY_ID,
+ )
+ )
+
+ # The key id is a known-duplicate we detected via content match
+ # NOTE: original and duplicate have the same key id.
+ elif key_id in self.duplicate_jobs_dict:
+ self.logger.debug(
+ f"Identified existing content-matched duplicate {key_id} "
+ "in incoming data."
+ )
+ duplicate_jobs_list.append(
+ DuplicatedJob(
+ original=None, # TODO: load ref from duplicates dict
+ duplicate=incoming_job,
+ type=DuplicateType.EXISTING_TFIDF,
+ )
+ )
+ else:
+ # This key_id is not duplicate, we can use it for TFIDF
+ filt_incoming_jobs_dict[key_id] = deepcopy(incoming_job)
+
+ # Run the tfidf vectorizer if we have enough jobs left after removing
+ # key duplicates
+ if (len(filt_incoming_jobs_dict.keys())
+ + len(filt_existing_jobs_dict.keys()) < self.min_tfidf_corpus_size):
+ self.logger.warning(
+ "Skipping content-similarity filter because there are fewer than "
+ f"{self.min_tfidf_corpus_size} jobs."
+ )
+ elif filt_incoming_jobs_dict:
+ duplicate_jobs_list.extend(
+ self.tfidf_filter(
+ incoming_jobs_dict=filt_incoming_jobs_dict,
+ existing_jobs_dict=filt_existing_jobs_dict,
+ )
+ )
+ else:
+ self.logger.warning(
+ "Skipping content-similarity filter because there are no "
+ "incoming jobs"
+ )
+
+ # Update duplicates list with more JSON-friendly entries
+ # TODO: we should retain a reference to the original job's contents
+ self.duplicate_jobs_dict.update({
+ j.duplicate.key_id: j.duplicate.as_json_entry
+ for j in duplicate_jobs_list
+ })
+
+ return duplicate_jobs_list
+
+ def tfidf_filter(self, incoming_jobs_dict: Dict[str, dict],
+ existing_jobs_dict: Dict[str, dict],
+ ) -> List[DuplicatedJob]:
+ """Fit a tfidf vectorizer to a corpus of Job.DESCRIPTIONs and identify
+ duplicate jobs by cosine-similarity.
+
+ NOTE/WARNING: if you are running this method, you should have already
+ removed any duplicates by key_id
+ NOTE: this only uses job descriptions to do the content matching.
+        NOTE: it is recommended that you have at least ~25 jobs in the corpus.
+ TODO: need to handle existing_jobs_dict = None
+ TODO: have this raise an exception if there are too few words.
+ TODO: we should consider caching the transformed corpus.
+
+ Args:
+            incoming_jobs_dict (Dict[str, Job]): dict of jobs containing
+                potential duplicates (i.e. jobs we just scraped)
+            existing_jobs_dict (Dict[str, Job]): the existing jobs dict
+                (i.e. the master CSV)
+
+ Raises:
+ ValueError: incoming_jobs_dict contains no job descriptions
+
+ Returns:
+ List[DuplicatedJob]: list of new duplicate Jobs and their existing
+ Jobs found via content matching (for use in JobFunnel).
+ """
+ def __dict_to_ids_and_words(jobs_dict: Dict[str, Job],
+ is_incoming: bool = False,
+                                    ) -> Tuple[List[str], List[str], Dict[str, Job]]:
+ """Get query words and ids as lists + prefilter
+ NOTE: this is just a convenience method since we do this 2x
+ TODO: consider moving this once/if we change iteration
+ """
+ ids = [] # type: List[str]
+ words = [] # type: List[str]
+ filt_job_dict = {} # type: Dict[str, Job]
+ for job in jobs_dict.values():
+ if is_incoming and job.key_id in self.duplicate_jobs_dict:
+ # NOTE: we should never see this for incoming jobs.
+ # we will see it for existing jobs since duplicates can
+ # share a key_id.
+ raise ValueError(
+ "Attempting to run TFIDF with existing duplicate "
+ f"{job.key_id}"
+ )
+                elif not job.description:
+ self.logger.debug(
+ f"Removing {job.key_id} from scrape result, empty "
+ "description."
+ )
+ else:
+ ids.append(job.key_id)
+ words.append(job.description)
+ # NOTE: We want to leave changing incoming_jobs_dict in
+ # place till the end or we will break usage of
+ # Job.update_if_newer()
+ filt_job_dict[job.key_id] = job
+
+ # TODO: assert on length of contents of the lists as well
+ if not words:
+ raise ValueError(
+ "No data to fit, are your job descriptions all empty?"
+ )
+ return ids, words, filt_job_dict
+
+ query_ids, query_words, filt_incoming_jobs_dict = \
+ __dict_to_ids_and_words(incoming_jobs_dict, is_incoming=True)
+
+ # Calculate corpus and format query data for TFIDF calculation
+ corpus = [] # type: List[str]
+ if existing_jobs_dict:
+ self.logger.debug("Running TFIDF on incoming vs existing data.")
+ reference_ids, reference_words, filt_existing_jobs_dict = \
+ __dict_to_ids_and_words(existing_jobs_dict, is_incoming=False)
+ corpus = query_words + reference_words
+ else:
+ self.logger.debug("Running TFIDF on incoming data only.")
+            reference_ids = query_ids
+ reference_words = query_words
+ filt_existing_jobs_dict = filt_incoming_jobs_dict
+ corpus = query_words
+
+ # Provide a warning if we have few words.
+ # TODO: warning should reflect actual corpus size
+ if len(corpus) < self.min_tfidf_corpus_size:
+ self.logger.warning(
+ "It is not recommended to use this filter with less than "
+ f"{self.min_tfidf_corpus_size} jobs"
+ )
+
+ # Fit vectorizer to entire corpus
+ self.vectorizer.fit(corpus)
+
+ # Calculate cosine similarity between reference and current blurbs
+        # Each row is the similarity of one query job against every reference job.
+ # TODO: impl. in a more efficient way since fit() does the transform too
+ similarities_per_query = cosine_similarity(
+ self.vectorizer.transform(query_words),
+ self.vectorizer.transform(reference_words)
+ if existing_jobs_dict else None,
+ )
+
+ # Find Duplicate jobs by similarity score
+ # NOTE: multiple jobs can be determined to be a duplicate of same job!
+ # TODO: traverse this so we look at max similarity for original vs query
+ # currently it's the other way around so we can look at multi-matching
+ # original jobs but not multiple matching queries for our original job.
+ new_duplicate_jobs_list = [] # type: List[DuplicatedJob]
+ for query_similarities, query_id in zip(similarities_per_query,
+ query_ids):
+
+ # Identify the jobs in existing_jobs_dict that our query is a
+ # duplicate of
+ # TODO: handle if everything is highly similar!
+ similar_indeces = np.where(
+ query_similarities >= self.max_similarity
+ )[0]
+ if similar_indeces.size > 0:
+ # TODO: capture if more jobs are similar by content match
+                top_similar_job = similar_indeces[
+                    np.argmax(query_similarities[similar_indeces])
+                ]
+ self.logger.debug(
+ f"Identified incoming job {query_id} as new duplicate by "
+ "contents of existing job "
+ f"{reference_ids[top_similar_job]}"
+ )
+ new_duplicate_jobs_list.append(
+ DuplicatedJob(
+ original=filt_existing_jobs_dict[
+ reference_ids[top_similar_job]],
+ duplicate=filt_incoming_jobs_dict[query_id],
+ type=DuplicateType.NEW_TFIDF,
+ )
+ )
+
+ if not new_duplicate_jobs_list:
+ self.logger.debug("Found no duplicates by content-matching.")
+
+ # returns a list of newly-detected duplicate Jobs
+ return new_duplicate_jobs_list
diff --git a/jobfunnel/backend/tools/tools.py b/jobfunnel/backend/tools/tools.py
new file mode 100644
index 00000000..dd850ac6
--- /dev/null
+++ b/jobfunnel/backend/tools/tools.py
@@ -0,0 +1,154 @@
+"""Assorted tools for all aspects of funnelin' that don't fit elsewhere
+"""
+import logging
+import re
+import sys
+from datetime import date, datetime, timedelta
+from typing import Optional
+
+from dateutil.relativedelta import relativedelta
+from selenium import webdriver
+from webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.firefox import GeckoDriverManager
+from webdriver_manager.microsoft import (EdgeChromiumDriverManager,
+ IEDriverManager)
+from webdriver_manager.opera import OperaDriverManager
+
+from jobfunnel.backend import Job
+
+# Initialize list and store regex objects of date quantifiers
+HOUR_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?(?:hour|hr)')
+DAY_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?(?:day|d)')
+MONTH_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?month')
+YEAR_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?year')
+RECENT_REGEX_A = re.compile(r'[tT]oday|[jJ]ust [pP]osted')
+RECENT_REGEX_B = re.compile(r'[yY]esterday')
+
+
+def get_logger(logger_name: str, level: int, file_path: Optional[str],
+ message_format: str) -> logging.Logger:
+ """Initialize and return a logger
+ NOTE: you can use this as a method to add logging to any function, but if
+ you want to use this within a class, just inherit Logger class.
+ TODO: make more easily configurable w/ defaults
+ TODO: streamline
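+
+    Example (illustrative; the file path and format string are placeholders):
+        logger = get_logger(
+            logger_name="MyScraper",
+            level=logging.INFO,
+            file_path="/tmp/jobfunnel.log",
+            message_format="[%(asctime)s] [%(levelname)s] %(name)s: %(message)s",
+        )
+        logger.info("scrape started")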
+ """
+ logger = logging.getLogger(logger_name)
+ logger.setLevel(level)
+ formatter = logging.Formatter(message_format)
+ stdout_handler = logging.StreamHandler(sys.stdout)
+ stdout_handler.setFormatter(formatter)
+ logger.addHandler(stdout_handler)
+    if file_path:
+        file_handler = logging.FileHandler(file_path)
+        file_handler.setFormatter(formatter)
+        logger.addHandler(file_handler)
+ return logger
+
+
+class Logger:
+ """Class that adds a self.logger attribute for stdio and fileio"""
+
+ def __init__(self, level: int, file_path: Optional[str] = None,
+ logger_name: Optional[str] = None,
+ message_format: Optional[str] = None) -> None:
+ """Add a logger to any class
+
+ Args:
+ level (int): logging level, which ought to be an Enum but isn't
+ file_path (Optional[str], optional): file path to log messages to.
+ NOTE: this logs at the specified log level.
+ logger_name (Optional[str], optional): base name for the logger,
+ should be unique. Defaults to inherited class name.
+ message_format (Optional[str], optional): the formatting of the
+ message to log. Defaults to a complete message with all info.
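+
+        Example (illustrative; MyScraper is a placeholder class):
+            class MyScraper(Logger):
+                def __init__(self) -> None:
+                    super().__init__(level=logging.INFO)
+                    self.logger.info("scraper initialized")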
+ """
+ logger_name = logger_name or self.__class__.__name__
+ message_format = message_format or (
+ f"[%(asctime)s] [%(levelname)s] {logger_name}: %(message)s"
+ )
+ self.logger = get_logger(
+ logger_name=logger_name,
+ level=level,
+ file_path=file_path,
+ message_format=message_format,
+ )
+
+
+def calc_post_date_from_relative_str(date_str: str) -> date:
+ """Identifies a job's post date via post age, updates in-place
+ NOTE: we round to nearest day only so that comparisons dont capture
+ portions of days.
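+
+    Example (illustrative; the result depends on the current date):
+        calc_post_date_from_relative_str('3 days ago')   # three days before today
+        calc_post_date_from_relative_str('just posted')  # today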
+ """
+ post_date = datetime.now() # type: date
+ # Supports almost all formats like 7 hours|days and 7 hr|d|+d
+ try:
+ # Hours old
+ hours_ago = HOUR_REGEX.findall(date_str)[0]
+ post_date -= timedelta(hours=int(hours_ago))
+ except IndexError:
+ # Days old
+ try:
+ days_ago = DAY_REGEX.findall(date_str)[0]
+ post_date -= timedelta(days=int(days_ago))
+ except IndexError:
+ # Months old
+ try:
+ months_ago = MONTH_REGEX.findall(date_str)[0]
+ post_date -= relativedelta(months=int(months_ago))
+ except IndexError:
+ # Years old
+ try:
+ years_ago = YEAR_REGEX.findall(date_str)[0]
+ post_date -= relativedelta(years=int(years_ago))
+ except IndexError:
+ # Try phrases like 'today'/'just posted'/'yesterday'
+                    if RECENT_REGEX_A.findall(date_str):
+ # Today
+ post_date = datetime.now()
+ elif RECENT_REGEX_B.findall(date_str):
+ # Yesterday
+ post_date -= timedelta(days=int(1))
+                    else:
+ # We have failed to correctly evaluate date.
+ raise ValueError(
+ f"Unable to calculate date from:\n{date_str}"
+ )
+
+ return post_date.replace(hour=0, minute=0, second=0, microsecond=0)
+
+
+def get_webdriver():
+ """Get whatever webdriver is availiable in the system.
+ webdriver_manager and selenium are currently being used for this.
+ Supported: Firefox, Chrome, Opera, Microsoft Edge, Internet Explorer
+ Returns:
+ webdriver that can be used for scraping.
+ Returns None if we don't find a supported webdriver.
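+
+    Example (illustrative; requires one of the supported browsers installed):
+        driver = get_webdriver()
+        driver.get('https://www.example.com')
+        driver.quit()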
+ """
+ try:
+ driver = webdriver.Firefox(
+ executable_path=GeckoDriverManager().install()
+ )
+ except Exception:
+ try:
+ driver = webdriver.Chrome(ChromeDriverManager().install())
+ except Exception:
+ try:
+ driver = webdriver.Ie(IEDriverManager().install())
+ except Exception:
+ try:
+ driver = webdriver.Opera(
+ executable_path=OperaDriverManager().install()
+ )
+ except Exception:
+ try:
+ driver = webdriver.Edge(
+ EdgeChromiumDriverManager().install()
+ )
+ except Exception:
+ raise RuntimeError(
+ "Your browser is not supported. Must have one of "
+ "the following installed to scrape: [Firefox, "
+ "Chrome, Opera, Microsoft Edge, Internet Explorer]"
+ )
+ return driver
diff --git a/jobfunnel/config/__init__.py b/jobfunnel/config/__init__.py
index e69de29b..e33f17b6 100644
--- a/jobfunnel/config/__init__.py
+++ b/jobfunnel/config/__init__.py
@@ -0,0 +1,9 @@
+from jobfunnel.config.settings import SettingsValidator, SETTINGS_YAML_SCHEMA
+from jobfunnel.config.base import BaseConfig
+from jobfunnel.config.delay import DelayConfig
+from jobfunnel.config.proxy import ProxyConfig
+from jobfunnel.config.search import SearchConfig
+from jobfunnel.config.manager import JobFunnelConfigManager
+from jobfunnel.config.cli import (
+ parse_cli, get_config_manager, build_config_dict
+)
diff --git a/jobfunnel/config/base.py b/jobfunnel/config/base.py
new file mode 100644
index 00000000..19561bf2
--- /dev/null
+++ b/jobfunnel/config/base.py
@@ -0,0 +1,18 @@
+"""Base config object with a validator
+"""
+from abc import ABC, abstractmethod
+
+
+class BaseConfig(ABC):
+ """Base config object
+ """
+
+ @abstractmethod
+ def __init__(self) -> None:
+ pass
+
+ def validate(self) -> None:
+ """This should raise Exceptions if self.attribs are bad
+ NOTE: if we use sub-configs we could potentially use Cerberus for this
+ against any vars(Config)
+ """
diff --git a/jobfunnel/config/cli.py b/jobfunnel/config/cli.py
new file mode 100644
index 00000000..268b1303
--- /dev/null
+++ b/jobfunnel/config/cli.py
@@ -0,0 +1,378 @@
+"""Configuration parsing module for CLI --> JobFunnelConfigManager
+"""
+import argparse
+from typing import Dict, Any, List
+import yaml
+
+from jobfunnel.config import (DelayConfig, JobFunnelConfigManager,
+ ProxyConfig, SearchConfig, SettingsValidator)
+from jobfunnel.resources import (LOG_LEVEL_NAMES, DelayAlgorithm, Locale,
+ Provider)
+from jobfunnel.resources.defaults import *
+
+
+def parse_cli(args: List[str]) -> Dict[str, Any]:
+ """Parse the command line arguments into an Dict[arg_name, arg_value]
+
+ TODO: need to ensure users can try out JobFunnel as easily as possible.
+ """
+ base_parser = argparse.ArgumentParser('Job Search CLI.')
+
+    # Independent arguments
+ base_parser.add_argument(
+ '--recover',
+ dest='do_recovery_mode',
+ action='store_true',
+        help='Reconstruct a new master CSV file from all available cache files. '
+ 'WARNING: this will replace all the statuses/etc in your master '
+ 'CSV, it is intended for starting fresh / recovering from a bad '
+ 'state.',
+ )
+
+ base_subparsers = base_parser.add_subparsers(required=False)
+
+ # Configure everything via a YAML (NOTE: no other parsers may be passed)
+ yaml_parser = base_subparsers.add_parser(
+ 'load',
+ help='Run using an existing configuration YAML.',
+ )
+
+ yaml_parser.add_argument(
+ '-s',
+ dest='settings_yaml_file',
+ type=str,
+ help='Path to a settings YAML file containing your job search config.',
+ required=True,
+ )
+
+ yaml_parser.add_argument(
+ '--no-scrape',
+ action='store_true',
+ help='Do not make any get requests, instead, load jobs from cache '
+ 'and update filters + CSV file. NOTE: overrides setting in YAML.',
+ )
+
+ yaml_parser.add_argument(
+ '-log-level',
+ type=str,
+ choices=LOG_LEVEL_NAMES,
+ help='Type of logging information shown on the terminal. NOTE: '
+ 'if passed, overrides the setting in YAML.',
+ required=False,
+ )
+
+ # We are using CLI for all arguments.
+ cli_parser = base_subparsers.add_parser(
+ 'inline',
+ help='Configure search query and data providers via CLI.',
+ )
+
+ cli_parser.add_argument(
+ '-log-level',
+ type=str,
+ choices=LOG_LEVEL_NAMES,
+ default=DEFAULT_LOG_LEVEL_NAME,
+ help='Type of logging information shown on the terminal.',
+ )
+ cli_parser.add_argument(
+ '--no-scrape',
+ action='store_true',
+ help='Do not make any get requests, instead, load jobs from cache '
+ 'and update filters + CSV file.',
+ )
+
+ # Paths
+ search_group = cli_parser.add_argument_group('paths')
+ search_group.add_argument(
+ '-csv',
+ dest='master_csv_file',
+ type=str,
+ help='Path to a master CSV file containing your search results.',
+ required=True,
+ )
+
+ search_group.add_argument(
+ '-cache',
+ dest='cache_folder',
+ type=str,
+ help='Directory where cached scrape data will be stored.',
+ required=True,
+ )
+
+ search_group.add_argument(
+ '-blf',
+ dest='block_list_file',
+ type=str,
+ help='JSON file of jobs you want to omit from your job search.',
+ required=True,
+ )
+
+ search_group.add_argument(
+ '-dl',
+ dest='duplicates_list_file',
+ type=str,
+ help='JSON file of jobs which have been detected to be duplicates of '
+ 'existing jobs.',
+ required=True,
+ )
+
+ search_group.add_argument(
+ '-log-file',
+ type=str,
+ help='Path to log file.',
+        required=True, # TODO: This should be optional (no writing to it at all).
+ )
+
+ # SearchConfig via CLI args subparser
+ search_group = cli_parser.add_argument_group('search')
+ search_group.add_argument(
+ '-kw',
+ dest='search.keywords',
+ type=str,
+ nargs='+',
+ help='List of job-search keywords (i.e. Engineer, AI).',
+ required=True,
+ )
+
+ search_group.add_argument(
+ '-l',
+ dest='search.locale',
+ type=str,
+ choices=[l.name for l in Locale],
+ help='Global location and language to use to scrape the job provider'
+ ' website (i.e. -l CANADA_ENGLISH -p indeed --> indeed.ca).',
+ required=True,
+ )
+
+ search_group.add_argument(
+ '-ps',
+ dest='search.province_or_state',
+ type=str,
+ help='Province/state value for your job-search area of interest. '
+ '(i.e. Ontario).',
+ required=True,
+ )
+
+ search_group.add_argument(
+ '-c',
+ dest='search.city',
+ type=str,
+ help='City/town value for job-search region (i.e. Waterloo).',
+ required=True,
+ )
+
+ search_group.add_argument(
+ '-cbl',
+ type=str,
+ dest='search.company_block_list',
+ nargs='+',
+ default=DEFAULT_COMPANY_BLOCK_LIST,
+ help='List of company names to omit from all search results '
+ '(i.e. SpamCompany, Cash5Gold).',
+ required=False,
+ )
+
+ search_group.add_argument(
+ '-p',
+ dest='search.providers',
+ type=str,
+ nargs='+',
+ choices=[p.name for p in Provider],
+ default=DEFAULT_PROVIDER_NAMES,
+ help='List of job-search providers (i.e. Indeed, Monster, GlassDoor).',
+ required=False,
+ )
+
+ search_group.add_argument(
+ '-r',
+ dest='search.radius',
+ type=int,
+ default=DEFAULT_SEARCH_RADIUS,
+ help='The maximum distance a job should be from the specified city. '
+             'NOTE: units are [km] for CANADA locales and [mi] for US locales.',
+ required=False,
+ )
+
+ search_group.add_argument(
+ '-max-listing-days',
+ dest='search.max_listing_days',
+ type=int,
+ default=DEFAULT_MAX_LISTING_DAYS,
+        help='The maximum number of days-old a job can be (i.e. pass 30 to '
+ 'filter out jobs older than a month).',
+ required=False,
+ )
+
+ search_group.add_argument(
+ '--similar-results',
+ dest='search.similar_results',
+ action='store_true',
+ help='Return more general results from search query '
+ '(NOTE: this is only available for Indeed provider).',
+ )
+
+    # Proxy args. TODO: if any proxy arg is passed, can argparse require the rest?
+ proxy_group = cli_parser.add_argument_group('proxy')
+ proxy_group.add_argument(
+ '-protocol',
+ dest='proxy.protocol',
+ type=str,
+ help='Proxy protocol.',
+ )
+ proxy_group.add_argument(
+ '-ip',
+ dest='proxy.ip',
+ type=str,
+ help='Proxy IP (V4) address.',
+ )
+ proxy_group.add_argument(
+ '-port',
+ dest='proxy.port',
+ type=str,
+ help='Proxy port address.',
+ )
+
+ # Delay stuff
+ delay_group = cli_parser.add_argument_group('delay')
+ delay_group.add_argument(
+ '--random',
+ dest='delay.random',
+ action='store_true',
+ help='Turn on random delaying.',
+ )
+
+ delay_group.add_argument(
+ '--converging',
+ dest='delay.converging',
+ action='store_true',
+ help='Use converging random delay. NOTE: this is intended to be used '
+ 'with --random',
+ )
+
+ delay_group.add_argument(
+ '-max',
+ dest='delay.max_duration',
+ type=float,
+ default=DEFAULT_DELAY_MAX_DURATION,
+ help='Set the maximum delay duration in seconds.',
+ )
+
+ delay_group.add_argument(
+ '-min',
+ dest='delay.min_duration',
+ type=float,
+ default=DEFAULT_DELAY_MIN_DURATION,
+        help='Set the minimum delay duration in seconds.',
+ )
+
+ delay_group.add_argument(
+ '-algorithm',
+ dest='delay.algorithm',
+ choices=[a.name for a in DelayAlgorithm],
+ default=DEFAULT_DELAY_ALGORITHM.name,
+ help='Select a function to calculate delay times with.',
+ )
+ return vars(base_parser.parse_args(args))
+
+
+def build_config_dict(args_dict: Dict[str, Any]) -> Dict[str, Any]:
+ """Parse the JobFunnel configuration settings and combine CLI, YAML and
+ defaults to build a valid config dictionary for initializing config objects.
+ """
+ # Build a config that respects CLI, defaults and YAML
+    # NOTE: we load a passed settings YAML first so we can inject CLI args after if needed
+ if 'settings_yaml_file' in args_dict:
+
+ # Load YAML
+ config = yaml.load(
+ open(args_dict['settings_yaml_file'], 'r'),
+ Loader=yaml.FullLoader,
+ )
+
+ # Inject any base level args (--no-scrape, -log-level)
+ config['no_scrape'] = args_dict['no_scrape']
+ if args_dict.get('log_level'):
+ config['log_level'] = args_dict['log_level']
+
+ # Set defaults for our YAML
+ config = SettingsValidator.normalized(config)
+
+ # Validate the config passed via YAML
+ if not SettingsValidator.validate(config):
+ raise ValueError(
+ f"Invalid Config settings yaml:\n{SettingsValidator.errors}"
+ )
+
+ else:
+
+        # Build the config from CLI arguments alone (no settings YAML passed)
+ sub_keys = ['search', 'delay', 'proxy']
+ config = {k: {} for k in sub_keys} # type: Dict[str, Dict[str, Any]]
+
+ # Handle all the sub-configs, and non-path, non-default CLI args
+ for key, value in args_dict.items():
+ if key == 'do_recovery_mode':
+ # This is not present in the schema, it is CLI only.
+ continue
+ elif value is not None:
+ if any([sub_key in key for sub_key in sub_keys]):
+ # Set sub-config value
+ key_sub_strings = key.split('.')
+ assert len(key_sub_strings) == 2, "Bad dest name: " + key
+ config[key_sub_strings[0]][key_sub_strings[1]] = value
+ else:
+ # Set base-config value
+ assert '.' not in key, "Bad base-key: " + key
+ config[key] = value
+
+ return config
+
+
+def get_config_manager(config: Dict[str, Any]) -> JobFunnelConfigManager:
+ """Method to build JobFunnelConfigManager from a config dictionary
+ """
+
+ # Build JobFunnelConfigManager
+ search_cfg = SearchConfig(
+ keywords=config['search']['keywords'],
+ province_or_state=config['search']['province_or_state'],
+ city=config['search']['city'],
+ distance_radius=config['search']['radius'],
+ return_similar_results=config['search']['similar_results'],
+ max_listing_days=config['search']['max_listing_days'],
+ blocked_company_names=config['search']['company_block_list'],
+ locale=Locale[config['search']['locale']],
+ providers=[Provider[p] for p in config['search']['providers']],
+ )
+
+ delay_cfg = DelayConfig(
+ max_duration=config['delay']['max_duration'],
+ min_duration=config['delay']['min_duration'],
+ algorithm=DelayAlgorithm[config['delay']['algorithm']],
+ random=config['delay']['random'],
+ converge=config['delay']['converging'],
+ )
+
+ if config.get('proxy'):
+ proxy_cfg = ProxyConfig(
+ protocol=config['proxy']['protocol'],
+ ip_address=config['proxy']['ip'],
+ port=config['proxy']['port'],
+ )
+ else:
+ proxy_cfg = None
+
+ funnel_cfg_mgr = JobFunnelConfigManager(
+ master_csv_file=config['master_csv_file'],
+ user_block_list_file=config['block_list_file'],
+ duplicates_list_file=config['duplicates_list_file'],
+ cache_folder=config['cache_folder'],
+ log_file=config['log_file'],
+ log_level=config['log_level'],
+ no_scrape=config['no_scrape'],
+ search_config=search_cfg,
+ delay_config=delay_cfg,
+ proxy_config=proxy_cfg,
+ )
+
+ return funnel_cfg_mgr
diff --git a/jobfunnel/config/delay.py b/jobfunnel/config/delay.py
new file mode 100644
index 00000000..ec631098
--- /dev/null
+++ b/jobfunnel/config/delay.py
@@ -0,0 +1,56 @@
+"""Simple config object to contain the delay configuration
+"""
+from jobfunnel.config.base import BaseConfig
+from jobfunnel.resources import DelayAlgorithm
+from jobfunnel.resources.defaults import (DEFAULT_DELAY_ALGORITHM,
+ DEFAULT_DELAY_MAX_DURATION,
+ DEFAULT_DELAY_MIN_DURATION,
+ DEFAULT_RANDOM_CONVERGING_DELAY,
+ DEFAULT_RANDOM_DELAY)
+
+
+class DelayConfig(BaseConfig):
+ """Simple config object to contain the delay configuration
+ """
+ def __init__(self, max_duration: float = DEFAULT_DELAY_MAX_DURATION,
+ min_duration: float = DEFAULT_DELAY_MIN_DURATION,
+ algorithm: DelayAlgorithm = DEFAULT_DELAY_ALGORITHM,
+ random: bool = DEFAULT_RANDOM_DELAY,
+ converge: bool = DEFAULT_RANDOM_CONVERGING_DELAY):
+ """Delaying Configuration for GET requests
+
+ Args:
+ max_duration (float, optional): max duration.
+ Defaults to DEFAULT_DELAY_MAX_DURATION.
+ min_duration (float, optional): min duration.
+ Defaults to DEFAULT_DELAY_MIN_DURATION.
+ algorithm (DelayAlgorithm, optional): algorithm.
+ Defaults to DEFAULT_DELAY_ALGORITHM.
+            random (bool, optional): enable random delaying.
+ Defaults to DEFAULT_RANDOM_DELAY.
+ converge (bool, optional): enable random converging delaying.
+ Defaults to DEFAULT_RANDOM_CONVERGING_DELAY.
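+
+        Example (illustrative; assumes DelayAlgorithm has a LINEAR member):
+            DelayConfig(
+                max_duration=10.0,
+                min_duration=1.0,
+                algorithm=DelayAlgorithm.LINEAR,
+                random=True,
+                converge=True,
+            )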
+ """
+ super().__init__()
+ self.max_duration = max_duration
+ self.min_duration = min_duration
+ self.algorithm = algorithm
+ self.random = random
+ self.converge = converge
+
+ def validate(self) -> None:
+ if self.max_duration <= 0:
+ raise ValueError("Your max delay is set to 0 or less.")
+ if self.min_duration <= 0 or self.min_duration >= self.max_duration:
+ raise ValueError(
+                "Minimum delay must be above 0 and less than the maximum delay."
+ )
+ if type(self.algorithm) != DelayAlgorithm:
+ raise ValueError(
+ f"Invalid Value for delaying algorithm: {self.algorithm}"
+ )
+ if self.converge and not self.random:
+ raise ValueError(
+                "You cannot configure converging random delay without also "
+ "enabling random delaying"
+ )
diff --git a/jobfunnel/config/manager.py b/jobfunnel/config/manager.py
new file mode 100644
index 00000000..13bcfa24
--- /dev/null
+++ b/jobfunnel/config/manager.py
@@ -0,0 +1,123 @@
+"""Config object to run JobFunnel
+"""
+import logging
+import os
+from typing import List, Optional
+
+from jobfunnel.backend.scrapers.registry import SCRAPER_FROM_LOCALE
+from jobfunnel.config import BaseConfig, DelayConfig, ProxyConfig, SearchConfig
+from jobfunnel.resources import BS4_PARSER
+
+# pylint: disable=using-constant-test,unused-import
+if False: # or typing.TYPE_CHECKING if python3.5.3+
+ from jobfunnel.backend.scrapers.base import BaseScraper
+# pylint: enable=using-constant-test,unused-import
+
+
+class JobFunnelConfigManager(BaseConfig):
+ """Master config containing all the information we need to run jobfunnel
+ """
+
+ def __init__(self,
+ master_csv_file: str,
+ user_block_list_file: str,
+ duplicates_list_file: str,
+ cache_folder: str,
+ search_config: SearchConfig,
+ log_file: str,
+ log_level: Optional[int] = logging.INFO,
+ no_scrape: Optional[bool] = False,
+ bs4_parser: Optional[str] = BS4_PARSER,
+ return_similar_results: Optional[bool] = False,
+ delay_config: Optional[DelayConfig] = None,
+ proxy_config: Optional[ProxyConfig] = None) -> None:
+ """Init a config that determines how we will scrape jobs from Scrapers
+ and how we will update CSV and filtering lists
+
+ TODO: we might want to make a RunTimeConfig with the flags etc.
+
+ Args:
+ master_csv_file (str): path to the .csv file that user interacts w/
+ user_block_list_file (str): path to a JSON that contains jobs user
+ has decided to omit from their .csv file (i.e. archive status)
+ duplicates_list_file (str): path to a JSON that contains jobs
+ which TFIDF has identified to be duplicates of an existing job
+ cache_folder (str): folder where all scrape data will be stored
+ search_config (SearchConfig): SearchTerms config which contains the
+ desired job search information (i.e. keywords)
+ log_file (str): file to log all logger calls to
+            log_level (int): level to log at, i.e. logging.DEBUG (10) for more detail
+ no_scrape (Optional[bool], optional): If True, will not scrape data
+ at all, instead will only update filters and CSV. Defaults to
+ False.
+ bs4_parser (Optional[str], optional): the parser to use for BS4.
+            return_similar_results (Optional[bool], optional): If True, we will
+                ask the job provider to provide more loosely-similar results for
+                our search queries. NOTE: currently only supported by Indeed.
+ delay_config (Optional[DelayConfig], optional): delay config object.
+ Defaults to a default delay config object.
+ proxy_config (Optional[ProxyConfig], optional): proxy config object.
+ Defaults to None, which will result in no proxy being used
+ """
+ super().__init__()
+ self.master_csv_file = master_csv_file
+ self.user_block_list_file = user_block_list_file
+ self.duplicates_list_file = duplicates_list_file
+ self.cache_folder = cache_folder
+ self.search_config = search_config
+ self.log_file = log_file
+ self.log_level = log_level
+ self.no_scrape = no_scrape
+ self.bs4_parser = bs4_parser # NOTE: this is not currently configurable
+ self.return_similar_results = return_similar_results
+ if not delay_config:
+ # We will always use a delay config to be respectful
+ self.delay_config = DelayConfig()
+ else:
+ self.delay_config = delay_config
+ self.proxy_config = proxy_config
+
+ @property
+ def scrapers(self) -> List['BaseScraper']:
+ """All the compatible scrapers for the provider_name
+ """
+ scrapers = [] # type: List[BaseScraper]
+ for pr in self.search_config.providers:
+ if pr in SCRAPER_FROM_LOCALE:
+ scrapers.append(
+ SCRAPER_FROM_LOCALE[pr][self.search_config.locale]
+ )
+ else:
+ raise ValueError(
+ f"No scraper available for unknown provider {pr}"
+ )
+ return scrapers
+
+ @property
+ def scraper_names(self) -> List[str]:
+ """User-readable names of the scrapers we will be running
+ """
+ return [s.__name__ for s in self.scrapers]
+
+ def create_dirs(self) -> None:
+ """Create the directories for attributes which refer to files / folders
+ NOTE: should be called before we validate()
+ """
+ for file_path in [self.master_csv_file, self.user_block_list_file,
+ self.duplicates_list_file, self.log_file]:
+ output_dir = os.path.dirname(os.path.abspath(file_path))
+ if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+ if not os.path.exists(self.cache_folder):
+ os.makedirs(self.cache_folder)
+
+ def validate(self) -> None:
+ """Validate the config object i.e. paths exit
+ NOTE: will raise exceptions if issues are encountered.
+ TODO: impl. more validation here
+ """
+ assert os.path.exists(self.cache_folder)
+ self.search_config.validate()
+ if self.proxy_config:
+ self.proxy_config.validate()
+ self.delay_config.validate()
diff --git a/jobfunnel/config/parser.py b/jobfunnel/config/parser.py
deleted file mode 100644
index 2acfa0d1..00000000
--- a/jobfunnel/config/parser.py
+++ /dev/null
@@ -1,309 +0,0 @@
-"""Configuration parsing module.
-
-"""
-import argparse
-import logging
-import os
-import yaml
-
-from .valid_options import CONFIG_TYPES
-from ..tools.tools import split_url
-
-log_levels = {'critical': logging.CRITICAL, 'error': logging.ERROR,
- 'warning': logging.WARNING, 'info': logging.INFO,
- 'debug': logging.DEBUG, 'notset': logging.NOTSET}
-
-
-class ConfigError(ValueError):
- def __init__(self, arg):
- self.strerror = f"ConfigError: '{arg}' has an invalid value"
- self.args = {arg}
-
-
-def parse_cli():
- """ Parse the command line arguments.
-
- """
- parser = argparse.ArgumentParser(
- 'CLI options take precedence over settings in the yaml file'
- 'empty arguments are replaced by settings in the default yaml file')
-
- parser.add_argument('-s',
- dest='settings',
- type=str,
- required=False,
- help='path to the yaml settings file')
-
- parser.add_argument('-o',
- dest='output_path',
- action='store',
- required=False,
- help='directory where the search results will be '
- 'stored')
-
- parser.add_argument('-kw',
- dest='keywords',
- nargs='*',
- required=False,
- help='list of keywords to use in the job search. ('
- 'i.e. Engineer, AI)')
-
- parser.add_argument('-p',
- dest='province',
- type=str,
- required=False,
- help='province value for a region ')
-
- parser.add_argument('--city',
- dest='city',
- type=str,
- required=False,
- help='city value for a region ')
-
- parser.add_argument('--domain',
- dest='domain',
- type=str,
- required=False,
- help='domain value for a region ')
-
- parser.add_argument('-r',
- dest='random',
- action='store_true',
- required=False,
- default=None,
- help='turn on random delaying')
-
- parser.add_argument('-c',
- dest='converge',
- action='store_true',
- required=False,
- default=None,
- help='use converging random delay')
-
- parser.add_argument('-d',
- dest='delay',
- type=float,
- required=False,
- help='set delay seconds for scrapes.')
-
- parser.add_argument('-md',
- dest='min_delay',
- type=float,
- required=False,
- help='set lower bound value for scraper')
-
- parser.add_argument('--fun',
- dest='function',
- type=str,
- required=False,
- default=None,
- choices=['constant', 'linear', 'sigmoid'],
- help='Select a function to calculate delay times with')
-
- parser.add_argument('--log_level',
- dest='log_level',
- type=str,
- required=False,
- default=None,
- choices=['critical', 'error', 'warning', 'info',
- 'debug', 'notset'],
- help='Type of logging information shown on the '
- 'terminal.')
-
- parser.add_argument('--similar',
- dest='similar',
- action='store_true',
- default=None,
- help='pass to get \'similar\' job listings')
-
- parser.add_argument('--no_scrape',
- dest='no_scrape',
- action='store_true',
- default=None,
- help='skip web-scraping and load a previously saved '
- 'daily scrape pickle')
-
- parser.add_argument('--proxy',
- dest='proxy',
- type=str,
- required=False,
- default=None,
- help='proxy address')
-
- parser.add_argument('--recover',
- dest='recover',
- action='store_true',
- default=None,
- help='recover master-list by accessing all historic '
- 'scrapes pickles')
-
- parser.add_argument('--save_dup',
- dest='save_duplicates',
- action='store_true',
- required=False,
- default=None,
- help='save duplicates popped by tf_idf filter to file')
- parser.add_argument('--max_listing_days',
- dest='max_listing_days',
- type=int,
- default=None,
- required=False,
- help='The maximum number of days old a job can be.'
- '(i.e pass 30 to filter out jobs older than a month)')
-
- return parser.parse_args()
-
-
-def cli_to_yaml(cli):
- """ Put program arguments into dictionary in same style as configuration
- yaml.
-
- """
- yaml = {
- 'output_path': cli.output_path,
- 'search_terms': {
- 'region': {
- 'province': cli.province,
- 'city': cli.city,
- 'domain': cli.domain
- },
- 'keywords': cli.keywords
- },
- 'log_level': cli.log_level,
- 'similar': cli.similar,
- 'no_scrape': cli.no_scrape,
- 'recover': cli.recover,
- 'save_duplicates': cli.save_duplicates,
- 'delay_config': {
- 'function': cli.function,
- 'delay': cli.delay,
- 'min_delay': cli.min_delay,
- 'random': cli.random,
- 'converge': cli.converge
- },
- 'max_listing_days': cli.max_listing_days,
- }
-
- if cli.proxy is not None:
- yaml['proxy'] = split_url(cli.proxy)
- return yaml
-
-
-def update_yaml(config, new_yaml):
- """ Update fields of current yaml with new yaml.
-
- """
- for k, v in new_yaml.items():
- # if v is a dict we need to dive deeper...
- if type(v) is dict:
- # There might be times where this dictionary is not in config,
- # but it still is a valid option inside of CONFIG_TYPES
- # such as it is in the case of proxy
- if k not in config:
- config[k] = v
-
- update_yaml(config[k], v)
- else:
- if v is not None:
- config[k] = v
-
-
-def recursive_check_config_types(config, types):
- """ Recursively check type of setting vars.
-
- """
- for k, v in config.items():
- # if type is dict than we have to recursively handle this
- if type(v) is dict:
- yield from recursive_check_config_types(v, types[k])
- else:
- yield (k, type(v) in types[k])
-
-
-def check_config_types(config):
- """ Check if no settings have a wrong type and if we do not have unsupported
- options.
-
- """
- # Get a dictionary of all types and boolean if it's the right type
- types_check = recursive_check_config_types(config, CONFIG_TYPES)
-
- # Select all wrong types and throw error when there is such a value
-
- wrong_types = [k for k, v in types_check if v is False]
- if len(wrong_types) > 0:
- raise ConfigError(', '.join(wrong_types))
-
-
-def parse_config():
- """ Parse the JobFunnel configuration settings.
-
- """
- # find the jobfunnel root dir
- jobfunnel_path = os.path.normpath(
- os.path.join(os.path.dirname(__file__), '..'))
-
- # load the default settings
- default_yaml_path = os.path.join(jobfunnel_path, 'config/settings.yaml')
- default_yaml = yaml.safe_load(open(default_yaml_path, 'r'))
-
- # parse the command line arguments
- cli = parse_cli()
- cli_yaml = cli_to_yaml(cli)
-
- # parse the settings file for the line arguments
- given_yaml = None
- given_yaml_path = None
- if cli.settings is not None:
- given_yaml_path = os.path.dirname(cli.settings)
- given_yaml = yaml.safe_load(open(cli.settings, 'r'))
-
- # combine default, given and argument yamls into one. Note that we update
- # the values of the default_yaml, so we use this for the rest of the file.
- # We could make a deep copy if necessary.
- config = default_yaml
- if given_yaml is not None:
- update_yaml(config, given_yaml)
- update_yaml(config, cli_yaml)
- # check if the config has valid attribute types
- check_config_types(config)
-
- # create output path and corresponding (children) data paths
- # I feel like this is not in line with the rest of the file's philosophy
- if cli.output_path is not None:
- output_path = cli.output_path
- elif given_yaml_path is not None:
- output_path = os.path.join(given_yaml_path, given_yaml['output_path'])
- else:
- output_path = default_yaml['output_path']
-
- # define paths and normalise
- config['data_path'] = os.path.join(output_path, 'data')
- config['master_list_path'] = os.path.join(output_path, 'master_list.csv')
- config['duplicate_list_path'] = os.path.join(
- output_path, 'duplicate_list.csv')
- config['filter_list_path'] = os.path.join(
- config['data_path'], 'filter_list.json')
- config['log_path'] = os.path.join(config['data_path'], 'jobfunnel.log')
-
- # normalize paths
- for p in ['data_path', 'master_list_path', 'duplicate_list_path',
- 'log_path', 'filter_list_path']:
- config[p] = os.path.normpath(config[p])
-
- # lower provider and delay function
- for i, p in enumerate(config['providers']):
- config['providers'][i] = p.lower()
- config['delay_config']['function'] = \
- config['delay_config']['function'].lower()
-
- # parse the log level
- config['log_level'] = log_levels[config['log_level']]
-
- # check if proxy and max_listing_days have not been set yet (optional)
- if 'proxy' not in config:
- config['proxy'] = None
- if 'max_listing_days' not in config:
- config['max_listing_days'] = None
-
- return config
diff --git a/jobfunnel/config/proxy.py b/jobfunnel/config/proxy.py
new file mode 100644
index 00000000..c3e23ac7
--- /dev/null
+++ b/jobfunnel/config/proxy.py
@@ -0,0 +1,33 @@
+"""Proxy configuration for Session()
+"""
+import ipaddress
+
+from jobfunnel.config import BaseConfig
+
+
+class ProxyConfig(BaseConfig):
+ """Simple config object to contain proxy configuration
+ """
+
+ def __init__(self, protocol: str, ip_address: str, port: int) -> None:
+ super().__init__()
+ self.protocol = protocol
+ self.ip_address = ip_address
+ self.port = port
+
+ @property
+ def url(self) -> str:
+ """Get the url string for use in a Session.proxies object
+ """
+ return f"{self.protocol}://{self.ip_address}:{self.port}"
+
+ def validate(self) -> None:
+ """Validate the format of ip addr and port
+ """
+ try:
+ # try to create an IPv4 address
+ ipaddress.IPv4Address(self.ip_address)
+        except ValueError:
+ raise ValueError(f"{self.ip_address} is not a valid IPv4 address")
+ assert isinstance(self.port, int), "Port must be an integer"
+ assert self.protocol, "Protocol is not set"
diff --git a/jobfunnel/config/search.py b/jobfunnel/config/search.py
new file mode 100644
index 00000000..c9a60bc0
--- /dev/null
+++ b/jobfunnel/config/search.py
@@ -0,0 +1,84 @@
+"""Object to contain job query metadata
+"""
+from typing import List, Optional
+from jobfunnel.config import BaseConfig
+from jobfunnel.resources import Locale, Provider
+from jobfunnel.resources.defaults import (
+ DEFAULT_SEARCH_RADIUS, DEFAULT_MAX_LISTING_DAYS,
+ DEFAULT_DOMAIN_FROM_LOCALE,
+)
+
+class SearchConfig(BaseConfig):
+ """Config object containing our desired job search information including
+ the Locale of the searcher, the region to search and what job providers to
+ search with.
+ """
+
+ def __init__(self,
+ keywords: List[str],
+ province_or_state: Optional[str],
+ locale: Locale,
+ providers: List[Provider],
+ city: Optional[str] = None,
+ distance_radius: Optional[int] = None,
+ return_similar_results: bool = False,
+ max_listing_days: Optional[int] = None,
+ blocked_company_names: Optional[List[str]] = None,
+ domain: Optional[str] = None,
+ remote: bool = False,):
+ """Search config for all job sources
+
+ Args:
+ keywords (List[str]): list of search keywords
+ province_or_state (str): province or state.
+ locale(Locale): the searcher's Locale, defines the job website
+ domain and the scrapers we will use to scrape it.
+ city (Optional[str], optional): city. Defaults to None.
+ distance_radius (Optional[int], optional): km/m radius. Defaults to
+ DEFAULT_SEARCH_RADIUS.
+            return_similar_results (Optional[bool], optional): return similar
+                results (Indeed only). Defaults to False.
+ max_listing_days (Optional[int], optional): oldest listing to show.
+ Defaults to DEFAULT_MAX_LISTING_DAYS.
+ blocked_company_names (Optional[List[str]]): list of names of
+ companies that we never want to see in our results.
+ domain (Optional[str], optional): domain string to use for search
+ querying. If not passed, will set based on locale. (i.e. 'ca')
+            remote: True to search for remote jobs only. TODO: implement in scrapers.
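+
+        Example (illustrative; the Provider member name is an assumption):
+            SearchConfig(
+                keywords=['Python'],
+                province_or_state='ON',
+                city='waterloo',
+                locale=Locale.CANADA_ENGLISH,
+                providers=[Provider.INDEED],
+            )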
+ """
+ super().__init__()
+ self.province_or_state = province_or_state
+ self.city = city.lower() if city else None
+ self.radius = distance_radius or DEFAULT_SEARCH_RADIUS
+ self.locale = locale
+ self.providers = providers
+ self.keywords = keywords
+ self.return_similar_results = return_similar_results # Indeed.X thing
+ self.max_listing_days = max_listing_days or DEFAULT_MAX_LISTING_DAYS
+ self.blocked_company_names = blocked_company_names
+ self.remote = remote
+
+ # Try to infer the domain string based on the locale.
+ if not domain:
+            if self.locale not in DEFAULT_DOMAIN_FROM_LOCALE:
+ raise ValueError(f"Unknown domain for locale: {self.locale}")
+ self.domain = DEFAULT_DOMAIN_FROM_LOCALE[self.locale]
+ else:
+ self.domain = domain
+
+ @property
+ def query_string(self) -> str:
+ """User-readable version of the keywords we are searching with for CSV
+ """
+ return ' '.join(self.keywords)
+
+ def validate(self):
+ """We need to have the right information set, not mixing stuff
+ """
+ assert self.province_or_state, "Province/State not set"
+ assert self.city, "City not set"
+ assert self.locale, "Locale not set"
+ assert self.providers and len(self.providers) >= 1, "Providers not set"
+ assert self.keywords and len(self.keywords) >= 1, "Keywords not set"
+ assert self.max_listing_days >= 1, "Cannot set max posting days < 1"
+ assert self.domain, "Domain not set"
diff --git a/jobfunnel/config/settings.py b/jobfunnel/config/settings.py
new file mode 100644
index 00000000..02e8dbe0
--- /dev/null
+++ b/jobfunnel/config/settings.py
@@ -0,0 +1,161 @@
+"""Settings YAML Schema w/ validator
+"""
+import ipaddress
+
+from cerberus import Validator
+
+from jobfunnel.resources import (LOG_LEVEL_NAMES, DelayAlgorithm, Locale,
+ Provider)
+from jobfunnel.resources.defaults import *
+
+SETTINGS_YAML_SCHEMA = {
+ 'master_csv_file': {
+ 'required': True,
+ 'type': 'string',
+ },
+ 'block_list_file': {
+ 'required': True,
+ 'type': 'string',
+ },
+ 'cache_folder': {
+ 'required': True,
+ 'type': 'string',
+ },
+ 'duplicates_list_file': {
+ 'required': True,
+ 'type': 'string',
+ },
+ 'no_scrape': {
+ 'required': False,
+ 'type': 'boolean',
+ 'default': DEFAULT_NO_SCRAPE,
+ },
+ 'log_level': {
+ 'required': False,
+ 'allowed': LOG_LEVEL_NAMES,
+ 'default': DEFAULT_LOG_LEVEL_NAME,
+ },
+ 'log_file': {
+ 'required': True, # TODO: allow this to be optional
+ 'type': 'string',
+ 'default': DEFAULT_LOG_FILE,
+ },
+ 'search': {
+ 'type': 'dict',
+ 'required': True,
+ 'schema': {
+ 'providers': {
+ 'required': False,
+ 'allowed': [p.name for p in Provider],
+ 'default': DEFAULT_PROVIDERS,
+ },
+ 'locale' : {
+ 'required': True,
+ 'allowed': [l.name for l in Locale],
+ },
+ 'province_or_state': {'required': True, 'type': 'string'},
+ 'city': {'required': True, 'type': 'string'},
+ 'radius': {
+ 'required': False,
+ 'type': 'integer',
+ 'min': 0,
+ 'default': DEFAULT_SEARCH_RADIUS,
+ },
+ 'similar_results': {
+ 'required': False,
+ 'type': 'boolean',
+ 'default': DEFAULT_RETURN_SIMILAR_RESULTS,
+ },
+ 'keywords': {
+ 'required': True,
+ 'type': 'list',
+ 'schema': {'type': 'string'},
+ },
+ 'max_listing_days': {
+ 'required': False,
+ 'type': 'integer',
+ 'min': 0,
+ 'default': DEFAULT_MAX_LISTING_DAYS,
+ },
+ 'company_block_list': {
+ 'required': False,
+ 'type': 'list',
+ 'schema': {'type': 'string'},
+ 'default': DEFAULT_COMPANY_BLOCK_LIST,
+ },
+ },
+ },
+ 'delay': {
+ 'type': 'dict',
+ 'required': False,
+ 'schema' : {
+ 'algorithm': {
+ 'required': False,
+ 'allowed': [d.name for d in DelayAlgorithm],
+ 'default': DEFAULT_DELAY_ALGORITHM.name,
+ },
+ # TODO: implement custom rule max > min
+ 'max_duration': {
+ 'required': False,
+ 'type': 'float',
+ 'min': 0,
+ 'default': DEFAULT_DELAY_MAX_DURATION,
+ },
+ 'min_duration': {
+ 'required': False,
+ 'type': 'float',
+ 'min': 0,
+ 'default': DEFAULT_DELAY_MIN_DURATION,
+ },
+ 'random': {
+ 'required': False,
+ 'type': 'boolean',
+ 'default': DEFAULT_RANDOM_DELAY,
+ },
+ 'converging': {
+ 'required': False,
+ 'type': 'boolean',
+ 'default': DEFAULT_RANDOM_CONVERGING_DELAY,
+ },
+ },
+ },
+ 'proxy': {
+ 'type': 'dict',
+ 'required': False,
+ 'schema' : {
+ 'protocol': {
+ 'required': False,
+ 'allowed': ['http', 'https'],
+ },
+ 'ip': {
+ 'required': False,
+ 'type': 'ipv4address',
+ },
+ 'port': {
+ 'required': False,
+ 'type': 'integer',
+ 'min': 0,
+ },
+ },
+ },
+}
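+
+# Example of a minimal settings YAML that satisfies the schema above
+# (illustrative paths; quoting 'ON' keeps YAML from parsing it as a boolean):
+#
+#   master_csv_file: master.csv
+#   cache_folder: cache
+#   block_list_file: block_list.json
+#   duplicates_list_file: duplicates_list.json
+#   log_file: log.log
+#   search:
+#     locale: CANADA_ENGLISH
+#     province_or_state: 'ON'
+#     city: waterloo
+#     keywords: [Python]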
+
+
+class JobFunnelSettingsValidator(Validator):
+ """A simple JSON data validator with a custom data type for IPv4 addresses
+ https://codingnetworker.com/2016/03/validate-json-data-using-cerberus/
+ """
+ def _validate_type_ipv4address(self, field, value):
+ """
+ checks that the given value is a valid IPv4 address
+ """
+ try:
+ # try to create an IPv4 address object using the python3 ipaddress
+ # module
+ ipaddress.IPv4Address(value)
+        except ValueError:
+ self._error(field, "Not a valid IPv4 address")
+
+
+SettingsValidator = JobFunnelSettingsValidator(SETTINGS_YAML_SCHEMA)
diff --git a/jobfunnel/config/settings.yaml b/jobfunnel/config/settings.yaml
deleted file mode 100644
index b58f5edc..00000000
--- a/jobfunnel/config/settings.yaml
+++ /dev/null
@@ -1,66 +0,0 @@
-# This is the default settings file. Do not edit.
-
-# all paths are relative to this file
-
-# paths
-output_path: 'search'
-
-# providers from which to search (case insensitive)
-providers:
- - 'GlassDoorStatic'
- - 'Indeed'
- - 'Monster'
-
-
-
-# filters
-search_terms:
- region:
- province: 'ON'
- city: 'waterloo'
- domain: 'ca'
- radius: 25
-
- keywords:
- - 'Python'
-
-# black-listed company names
-black_list:
- - 'Infox Consulting'
-
-# logging level options are: critical, error, warning, info, debug, notset
-log_level: 'info'
-
-# keep similar job postings
-similar: False
-
-# skip web-scraping and load a previously saved daily scrape pickle
-no_scrape: False
-
-# recover master-list by accessing all historic scrapes pickles
-recover: False
-
-# saves duplicates removed by tfidf filter to duplicate_list.csv
-save_duplicates: False
-
-# delaying algorithm configuration
-delay_config:
- # functions used for delaying algorithm, options are: constant, linear, sigmoid
- function: 'linear'
- # maximum delay/upper bound for converging random delay
- delay: 10.0
- # minimum delay/lower bound for random delay
- min_delay: 1.0
- # random delay
- random: False
- # converging random delay, only used if 'random' is set to True
- converge: False
-
-# proxy settings
-# proxy:
-# # protocol (http or https)
-# protocol: 'https'
-# # ip address
-# ip_address: '1.1.1.1'
-# # port
-# port: '200'
diff --git a/jobfunnel/config/valid_options.py b/jobfunnel/config/valid_options.py
deleted file mode 100644
index b3b0c5f7..00000000
--- a/jobfunnel/config/valid_options.py
+++ /dev/null
@@ -1,38 +0,0 @@
-CONFIG_TYPES = {
- 'output_path': [str],
- 'providers': [list],
- 'search_terms': {
- 'region': {
- 'province': [str],
- 'state': [str],
- 'city': [str],
- 'domain': [str],
- 'radius': [int]
- },
- 'keywords': [list]
- },
- 'black_list': [list],
- 'log_level': [str],
- 'similar': [bool],
- 'no_scrape': [bool],
- 'recover': [bool],
- 'save_duplicates': [bool],
- 'delay_config': {
- 'function': [str],
- 'delay': [float, int],
- 'min_delay': [float, int],
- 'random': [bool],
- 'converge': [bool]
- },
- 'proxy': {
- 'protocol': [str],
- 'ip_address': [str],
- 'port': [str]
- },
- 'max_listing_days': [int],
-
-}
-
-PROVIDERS = ['glassdoordynamic', 'glassdoorstatic', 'indeed', 'monster']
-DOMAINS = ['com', 'ca']
-DELAY_FUN = ['constant', 'linear', 'sigmoid']
diff --git a/jobfunnel/config/validate.py b/jobfunnel/config/validate.py
deleted file mode 100644
index 1c40806b..00000000
--- a/jobfunnel/config/validate.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import re
-
-from .valid_options import DOMAINS, PROVIDERS, DELAY_FUN
-from .parser import ConfigError
-
-
-def validate_region(region):
- """ Check if the region settings are valid.
-
- """
- # only allow supported domains
- if region['domain'] not in DOMAINS:
- raise ConfigError('domain')
-
- # search term state is inserted as province if province does not already
- # exist
- if 'state' in region:
- if (region['state'] is not None) and (region['province'] is None):
- region['province'] = region['state']
-
- # north american jobs should have a province/state provided
- if region['domain'] in ['com', 'ca'] and region['province'] is None:
- raise ConfigError('province')
-
-
-def validate_delay(delay):
- """ Check if the delay has a valid configuration.
-
- """
- # delay function should be constant, linear or sigmoid
- if delay['function'] not in DELAY_FUN:
- raise ConfigError('delay_function')
-
- # maximum delay should be larger or equal to minimum delay
- if delay['delay'] < delay['min_delay']:
- raise ConfigError('(min)_delay')
-
- # minimum delay should be at least 1 and maximum delay at least 10
- if delay['delay'] < 10 or delay['min_delay'] < 1:
- raise ConfigError('(min)_delay')
-
-
-def validate_config(config):
- """ Check whether the config is a valid configuration.
-
- Some options are already checked at the command-line tool, e.g., loggging.
- Some checks are trivial while others have a separate function.
- """
- # check if paths are valid
- check_paths = {
- 'data_path': r'data$',
- 'master_list_path': r'master_list\.csv$',
- 'duplicate_list_path': r'duplicate_list\.csv$',
- 'log_path': r'data[\\\/]jobfunnel.log$',
- 'filter_list_path': r'data[\\\/]filter_list\.json$',
- }
-
- for path, pattern in check_paths.items():
- if not re.search(pattern, config[path]):
- raise ConfigError(path)
- # check if the provider list only consists of supported providers
- if not set(config['providers']).issubset(PROVIDERS):
- raise ConfigError('providers')
-
- # check validity of region settings
- validate_region(config['search_terms']['region'])
-
- # check validity of delay settings
- validate_delay(config['delay_config'])
-
- # check the validity of max_listing_days settings
- if(config['max_listing_days'] is not None and config['max_listing_days'] < 0):
- raise ConfigError('max_listing_days')
diff --git a/jobfunnel/glassdoor_base.py b/jobfunnel/glassdoor_base.py
deleted file mode 100644
index 6eb0bda4..00000000
--- a/jobfunnel/glassdoor_base.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import re
-
-from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, wait
-from logging import info as log_info
-from math import ceil
-from requests import post
-from time import sleep, time
-
-from .jobfunnel import JobFunnel, MASTERLIST_HEADER
-from .tools.tools import filter_non_printables
-from .tools.tools import post_date_from_relative_post_age
-
-
-class GlassDoorBase(JobFunnel):
- def __init__(self, args):
- super().__init__(args)
- self.provider = 'glassdoorbase'
- self.max_results_per_page = 30
- self.delay = 0
-
- self.location_headers = {
- 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
- 'image/webp,*/*;q=0.01',
- 'accept-encoding': 'gzip, deflate, sdch, br',
- 'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
- 'referer': 'https://www.glassdoor.{0}/'.format(
- self.search_terms['region']['domain']
- ),
- 'upgrade-insecure-requests': '1',
- 'user-agent': self.user_agent,
- 'Cache-Control': 'no-cache',
- 'Connection': 'keep-alive',
- }
-
- def convert_radius(self, radius):
- """function that quantizes the user input radius to a valid radius
- value: 10, 20, 30, 50, 100, and 200 kilometers"""
- if self.search_terms['region']['domain'] == 'com':
- if radius < 5:
- radius = 0
- elif 5 <= radius < 10:
- radius = 5
- elif 10 <= radius < 15:
- radius = 10
- elif 15 <= radius < 25:
- radius = 15
- elif 25 <= radius < 50:
- radius = 25
- elif 50 <= radius < 100:
- radius = 50
- elif radius >= 100:
- radius = 100
- return radius
-
- else:
- if radius < 10:
- radius = 0
- elif 10 <= radius < 20:
- radius = 10
- elif 20 <= radius < 30:
- radius = 20
- elif 30 <= radius < 50:
- radius = 30
- elif 50 <= radius < 100:
- radius = 50
- elif 100 <= radius < 200:
- radius = 100
- elif radius >= 200:
- radius = 200
-
- glassdoor_radius = {0: 0,
- 10: 6,
- 20: 12,
- 30: 19,
- 50: 31,
- 100: 62,
- 200: 124}
-
- return glassdoor_radius[radius]
-
- def parse_blurb(self, job, html):
- """parses and stores job description into dict entry"""
- job_link_soup = BeautifulSoup(html, self.bs4_parser)
-
- try:
- job['blurb'] = job_link_soup.find(
- id='JobDescriptionContainer').text.strip()
- except AttributeError:
- job['blurb'] = ''
-
- filter_non_printables(job)
diff --git a/jobfunnel/glassdoor_dynamic.py b/jobfunnel/glassdoor_dynamic.py
deleted file mode 100644
index ffb753c1..00000000
--- a/jobfunnel/glassdoor_dynamic.py
+++ /dev/null
@@ -1,269 +0,0 @@
-import re
-
-from bs4 import BeautifulSoup
-from selenium import webdriver
-from concurrent.futures import ThreadPoolExecutor, wait
-from logging import info as log_info
-from math import ceil
-from requests import post
-from time import sleep, time
-
-
-from .jobfunnel import JobFunnel, MASTERLIST_HEADER
-from .tools.tools import filter_non_printables
-from .tools.tools import post_date_from_relative_post_age, get_webdriver
-from .glassdoor_base import GlassDoorBase
-
-
-class GlassDoorDynamic(GlassDoorBase):
- """The Dynamic Version of the GlassDoor scraper, that uses selenium to scrape job postings."""
-
- def __init__(self, args):
- super().__init__(args)
- self.provider = 'glassdoordynamic'
-
- # Keeping old query function so this class does not break.
- self.query = '-'.join(self.search_terms['keywords'])
- # initialize the webdriver
- self.driver = get_webdriver()
-
- def get_search_url(self, method='get'):
- """gets the glassdoor search url"""
- # form the location lookup request data
- data = {'term': self.search_terms['region']
- ['city'], 'maxLocationsToReturn': 10}
-
- # form the location lookup url
- location_url = 'https://www.glassdoor.co.in/findPopularLocationAjax.htm?'
-
- # get the location id for search location
- location_response = self.s.post(
- location_url, headers=self.location_headers, data=data
- ).json()
-
- if method == 'get':
- # form job search url
- search = (
- 'https://www.glassdoor.{0}/Job/jobs.htm?'
- 'clickSource=searchBtn&sc.keyword={1}&locT=C&locId={2}&jobType=&radius={3}'.format(
- self.search_terms['region']['domain'],
- self.query,
- location_response[0]['locationId'],
- self.convert_radius(self.search_terms['region']['radius']),
- )
- )
-
- return search
- elif method == 'post':
- # form the job search url
- search = (
- f'https://www.glassdoor.'
- f"{self.search_terms['region']['domain']}/Job/jobs.htm"
- )
-
- # form the job search data
- data = {
- 'clickSource': 'searchBtn',
- 'sc.keyword': self.query,
- 'locT': 'C',
- 'locId': location_response[0]['locationId'],
- 'jobType': '',
- 'radius': self.convert_radius(self.search_terms['region']['radius']),
- }
-
- return search, data
- else:
- raise ValueError(f'No html method {method} exists')
-
- def search_page_for_job_soups(self, page, url, job_soup_list):
- """function that scrapes the glassdoor page for a list of job soups"""
- log_info(f'getting glassdoor page {page} : {url}')
-
- self.driver.get(url)
- job = BeautifulSoup(self.driver.page_source, self.bs4_parser).find_all(
- 'li', attrs={'class', 'jl'}
- )
- job_soup_list.extend(job)
-
- def search_joblink_for_blurb(self, job):
- """function that scrapes the glassdoor job link for the blurb"""
- search = job['link']
- log_info(f'getting glassdoor search: {search}')
-
- self.driver.get(search)
- job_link_soup = BeautifulSoup(self.driver.page_source, self.bs4_parser)
-
- try:
- job['blurb'] = job_link_soup.find(
- id='JobDescriptionContainer').text.strip()
- except AttributeError:
- job['blurb'] = ''
-
- filter_non_printables(job)
-
- # split apart above function into two so gotten blurbs can be parsed
- # while others blurbs are being obtained
- def get_blurb_with_delay(self, job, delay):
- """gets blurb from glassdoor job link and sets delays for requests"""
- sleep(delay)
-
- search = job['link']
- log_info(f'delay of {delay:.2f}s, getting glassdoor search: {search}')
-
- self.driver.get(search)
- res = self.driver.page_source
- return job, res
-
- def scrape(self):
- """function that scrapes job posting from glassdoor and pickles it"""
- log_info(f'jobfunnel glassdoor to pickle running @ {self.date_string}')
-
- # get the se arch url
- search = self.get_search_url()
-
- # get the html data, initialize bs4 with lxml
- self.driver.get(search)
-
- # create the soup base
- soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
- num_res = soup_base.find('p', attrs={
- 'class', 'jobsCount'})
- while num_res is None:
- print("It looks like Glassdoor may be asking you to complete a CAPTCHA. If so:"
- "\n 1. Refresh the Glassdoor page in the browser window that just opened."
- "\n 2. Complete the CAPTCHA in the browser.\n 3. Press Enter here to continue.")
- # wait for user to complete CAPTCHA
- input()
- soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
- num_res = soup_base.find('p', attrs={'class', 'jobsCount'})
- # scrape total number of results, and calculate the # pages needed
-
- num_res = num_res.text.strip()
- num_res = int(re.findall(r'(\d+)', num_res.replace(',', ''))[0])
- log_info(
- f'Found {num_res} glassdoor results for query=' f'{self.query}')
-
- pages = int(ceil(num_res / self.max_results_per_page))
-
- # init list of job soups
- job_soup_list = []
- # init threads
- threads = ThreadPoolExecutor(max_workers=1)
- # init futures list
- fts = []
-
- # search the pages to extract the list of job soups
- for page in range(1, pages + 1):
- if page == 1:
- fts.append( # append thread job future to futures list
- threads.submit(
- self.search_page_for_job_soups,
- page,
- self.driver.current_url,
- job_soup_list,
- )
- )
- else:
- # gets partial url for next page
- part_url = (
- soup_base.find('li', attrs={'class', 'next'}).find(
- 'a').get('href')
- )
- # uses partial url to construct next page url
- page_url = re.sub(
- r'.htm',
- 'IP' + str(page) + '.htm',
- f'https://www.glassdoor.'
- f"{self.search_terms['region']['domain']}"
- f'{part_url}',
- )
-
- fts.append( # append thread job future to futures list
- threads.submit(
- self.search_page_for_job_soups, page, page_url, job_soup_list
- )
- )
- wait(fts) # wait for all scrape jobs to finish
- # close and shutdown the web driver
- self.driver.close()
- # make a dict of job postings from the listing briefs
- for s in job_soup_list:
- # init dict to store scraped data
- job = dict([(k, '') for k in MASTERLIST_HEADER])
-
- # scrape the post data
- job['status'] = 'new'
- try:
- # jobs should at minimum have a title, company and location
- job['title'] = s.find_all('a', attrs={'class', 'jobTitle'})[
- 1
- ].text.strip()
- job['company'] = s.find(
- 'div', attrs={'class', 'jobEmpolyerName'}
- ).text.strip()
- job['location'] = s.find(
- 'span', attrs={'class', 'loc'}).text.strip()
- except AttributeError:
- continue
-
- # set blurb to none for now
- job['blurb'] = ''
-
- try:
- labels = s.find_all('div', attrs={'class', 'jobLabel'})
- job['tags'] = '\n'.join(
- [l.text.strip() for l in labels if l.text.strip() != 'New']
- )
- except AttributeError:
- job['tags'] = ''
-
- try:
- # dynamic way of fetching date
- job['date'] = s.find('div', attrs={
- 'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
- except AttributeError:
- job['date'] = ''
-
- try:
- job['id'] = s.get('data-id')
- job['link'] = (
- s.find('div', attrs={'class', 'logoWrap'}).find(
- 'a').get('href')
- )
-
- except (AttributeError, IndexError):
- job['id'] = ''
- job['link'] = ''
-
- job['query'] = self.query
- job['provider'] = self.provider
-
- # key by id
- self.scrape_data[str(job['id'])] = job
-
- # Do not change the order of the next three statements if you want date_filter to work
-
- # stores references to jobs in list to be used in blurb retrieval
- scrape_list = [i for i in self.scrape_data.values()]
- # converts job date formats into a standard date format
- post_date_from_relative_post_age(scrape_list)
- # apply job pre-filter before scraping blurbs
- super().pre_filter(self.scrape_data, self.provider)
-
- # checks if delay is set or not, then extracts blurbs from job links
- if self.delay_config is not None:
- # calls super class to run delay specific threading logic
- super().delay_threader(
- scrape_list, self.get_blurb_with_delay, self.parse_blurb, threads
- )
-
- else: # maps jobs to threads and cleans them up when done
- # start time recording
- start = time()
-
- # maps jobs to threads and cleans them up when done
- threads.map(self.search_joblink_for_blurb, scrape_list)
- threads.shutdown()
-
- # end and print recorded time
- end = time()
- print(f'{self.provider} scrape job took {(end - start):.3f}s')
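
The dynamic scraper above blocks on manual CAPTCHA completion before the jobs count becomes readable. A minimal sketch of that wait loop, assuming an already-initialized Selenium driver and BeautifulSoup; the wait_for_results helper name is illustrative, not part of the removed module:

from bs4 import BeautifulSoup

def wait_for_results(driver, parser='lxml'):
    """Poll the loaded Glassdoor page until the jobs-count element appears,
    prompting the user to solve any CAPTCHA shown in the visible browser."""
    soup = BeautifulSoup(driver.page_source, parser)
    while soup.find('p', attrs={'class': 'jobsCount'}) is None:
        input(
            "Glassdoor may be showing a CAPTCHA. Refresh the page in the "
            "browser window, complete the CAPTCHA, then press Enter here."
        )
        soup = BeautifulSoup(driver.page_source, parser)
    return soup
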
diff --git a/jobfunnel/glassdoor_static.py b/jobfunnel/glassdoor_static.py
deleted file mode 100644
index 8c2a5d8f..00000000
--- a/jobfunnel/glassdoor_static.py
+++ /dev/null
@@ -1,272 +0,0 @@
-import re
-
-from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, wait
-from logging import info as log_info
-from math import ceil
-from time import sleep, time
-
-from .jobfunnel import JobFunnel, MASTERLIST_HEADER
-from .tools.tools import filter_non_printables
-from .tools.tools import post_date_from_relative_post_age
-from .glassdoor_base import GlassDoorBase
-
-
-class GlassDoorStatic(GlassDoorBase):
- def __init__(self, args):
- super().__init__(args)
- self.provider = 'glassdoorstatic'
- self.headers = {
- 'accept': 'text/html,application/xhtml+xml,application/xml;'
- 'q=0.9,image/webp,*/*;q=0.8',
- 'accept-encoding': 'gzip, deflate, sdch, br',
- 'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
- 'referer': 'https://www.glassdoor.{0}/'.format(
- self.search_terms['region']['domain']
- ),
- 'upgrade-insecure-requests': '1',
- 'user-agent': self.user_agent,
- 'Cache-Control': 'no-cache',
- 'Connection': 'keep-alive',
- }
- # Sets headers as default on Session object
- self.s.headers.update(self.headers)
- # Concatenates keywords with spaces
- self.query = ' '.join(self.search_terms['keywords'])
-
- def get_search_url(self, method='get'):
- """gets the glassdoor search url"""
- # form the location lookup request data
- data = {'term': self.search_terms['region']
- ['city'], 'maxLocationsToReturn': 10}
-
- # form the location lookup url
- location_url = 'https://www.glassdoor.co.in/findPopularLocationAjax.htm?'
-
- # get location id for search location
- location_response = self.s.post(
- # set location headers to override default session headers
- location_url, headers=self.location_headers, data=data
- ).json()
-
- if method == 'get':
- # @TODO implement get style for glassdoor
- raise NotImplementedError()
- elif method == 'post':
- # form the job search url
- search = (
- f'https://www.glassdoor.'
- f"{self.search_terms['region']['domain']}/Job/jobs.htm"
- )
-
- # form the job search data
- data = {
- 'clickSource': 'searchBtn',
- 'sc.keyword': self.query,
- 'locT': 'C',
- 'locId': location_response[0]['locationId'],
- 'jobType': '',
- 'radius': self.convert_radius(self.search_terms['region']['radius']),
- }
-
- return search, data
- else:
- raise ValueError(f'No html method {method} exists')
-
- def search_page_for_job_soups(self, page, url, job_soup_list):
- """function that scrapes the glassdoor page for a list of job soups"""
- log_info(f'getting glassdoor page {page} : {url}')
-
- job = BeautifulSoup(
- self.s.get(url).text, self.bs4_parser
- ).find_all('li', attrs={'class', 'jl'})
- job_soup_list.extend(job)
-
- def search_joblink_for_blurb(self, job):
- """function that scrapes the glassdoor job link for the blurb"""
- search = job['link']
- log_info(f'getting glassdoor search: {search}')
-
- job_link_soup = BeautifulSoup(
- self.s.get(search).text, self.bs4_parser
- )
-
- try:
- job['blurb'] = job_link_soup.find(
- id='JobDescriptionContainer').text.strip()
- except AttributeError:
- job['blurb'] = ''
-
- filter_non_printables(job)
-
- # the above function is split in two so that fetched blurbs can be parsed
- # while the remaining blurbs are still being retrieved
- def get_blurb_with_delay(self, job, delay):
- """gets blurb from glassdoor job link and sets delays for requests"""
- sleep(delay)
-
- search = job['link']
- log_info(f'delay of {delay:.2f}s, getting glassdoor search: {search}')
-
- res = self.s.get(search).text
- return job, res
-
- def scrape(self):
- """function that scrapes job posting from glassdoor and pickles it"""
- log_info(f'jobfunnel glassdoor to pickle running @ {self.date_string}')
-
- # get the search url and data
- search, data = self.get_search_url(method='post')
-
- # get the html data, initialize bs4 with lxml
- request_html = self.s.post(search, data=data)
-
- # create the soup base
- soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
-
- # scrape total number of results, and calculate the # pages needed
- num_res = soup_base.find(
- 'p', attrs={'class', 'jobsCount'}).text.strip()
- num_res = int(re.findall(r'(\d+)', num_res.replace(',', ''))[0])
- log_info(
- f'Found {num_res} glassdoor results for query=' f'{self.query}')
-
- pages = int(ceil(num_res / self.max_results_per_page))
-
- # init list of job soups
- job_soup_list = []
- # init threads
- threads = ThreadPoolExecutor(max_workers=8)
- # init futures list
- fts = []
-
- # search the pages to extract the list of job soups
- for page in range(1, pages + 1):
- if page == 1:
- fts.append( # append thread job future to futures list
- threads.submit(
- self.search_page_for_job_soups,
- page,
- request_html.url,
- job_soup_list,
- )
- )
- else:
- # gets partial url for next page
- part_url = (
- soup_base.find('li', attrs={'class', 'next'}).find(
- 'a').get('href')
- )
- # uses partial url to construct next page url
- page_url = re.sub(
- r'_IP\d+\.',
- '_IP' + str(page) + '.',
- f'https://www.glassdoor.'
- f"{self.search_terms['region']['domain']}"
- f'{part_url}',
- )
-
- fts.append( # append thread job future to futures list
- threads.submit(
- self.search_page_for_job_soups,
- page,
- page_url,
- job_soup_list,
- )
- )
- wait(fts) # wait for all scrape jobs to finish
-
- # make a dict of job postings from the listing briefs
- for s in job_soup_list:
- # init dict to store scraped data
- job = dict([(k, '') for k in MASTERLIST_HEADER])
-
- # scrape the post data
- job['status'] = 'new'
- try:
- # jobs should at minimum have a title, company and location
- job['title'] = (
- s.find('div', attrs={'class', 'jobContainer'})
- .find(
- 'a',
- attrs={'class', 'jobLink jobInfoItem jobTitle'},
- recursive=False,
- )
- .text.strip()
- )
- job['company'] = s.find(
- 'div', attrs={'class', 'jobInfoItem jobEmpolyerName'}
- ).text.strip()
- job['location'] = s.get('data-job-loc')
- except AttributeError:
- continue
-
- # set blurb to none for now
- job['blurb'] = ''
-
- try:
- labels = s.find_all('div', attrs={'class', 'jobLabel'})
- job['tags'] = '\n'.join(
- [l.text.strip() for l in labels if l.text.strip() != 'New']
- )
- except AttributeError:
- job['tags'] = ''
-
- try:
- job['date'] = (
- s.find('div', attrs={'class', 'jobLabels'})
- .find('span', attrs={'class', 'jobLabel nowrap'})
- .text.strip()
- )
- except AttributeError:
- job['date'] = ''
-
- try:
- part_url = (
- s.find('div', attrs={'class', 'logoWrap'}).find(
- 'a').get('href')
- )
- job['id'] = s.get('data-id')
- job['link'] = (
- f'https://www.glassdoor.'
- f"{self.search_terms['region']['domain']}"
- f'{part_url}'
- )
-
- except (AttributeError, IndexError):
- job['id'] = ''
- job['link'] = ''
-
- job['query'] = self.query
- job['provider'] = self.provider
-
- # key by id
- self.scrape_data[str(job['id'])] = job
-
- # Do not change the order of the next three statements if you want date_filter to work
-
- # stores references to jobs in list to be used in blurb retrieval
- scrape_list = [i for i in self.scrape_data.values()]
- # converts job date formats into a standard date format
- post_date_from_relative_post_age(scrape_list)
- # apply job pre-filter before scraping blurbs
- super().pre_filter(self.scrape_data, self.provider)
-
- # checks if delay is set or not, then extracts blurbs from job links
- if self.delay_config is not None:
- # calls super class to run delay specific threading logic
- super().delay_threader(
- scrape_list, self.get_blurb_with_delay, self.parse_blurb, threads
- )
-
- else: # maps jobs to threads and cleans them up when done
- # start time recording
- start = time()
-
- # maps jobs to threads and cleans them up when done
- threads.map(self.search_joblink_for_blurb, scrape_list)
- threads.shutdown()
-
- # end and print recorded time
- end = time()
- print(f'{self.provider} scrape job took {(end - start):.3f}s')
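
A note on the static scraper's pagination: subsequent page URLs are produced by rewriting the _IP<n>. token in the first results page's URL. A small sketch of that rewrite, using a made-up partial URL for illustration:

import re

def build_page_url(domain: str, part_url: str, page: int) -> str:
    """Rewrite the _IP<n>. page token of a Glassdoor results URL for `page`."""
    return re.sub(r'_IP\d+\.', f'_IP{page}.',
                  f'https://www.glassdoor.{domain}{part_url}')

# Illustrative only:
# build_page_url('ca', '/Job/python-jobs-SRCH_KO0,6_IP2.htm', 3)
# -> 'https://www.glassdoor.ca/Job/python-jobs-SRCH_KO0,6_IP3.htm'
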
diff --git a/jobfunnel/indeed.py b/jobfunnel/indeed.py
deleted file mode 100644
index 53fab9a1..00000000
--- a/jobfunnel/indeed.py
+++ /dev/null
@@ -1,342 +0,0 @@
-import re
-
-from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, wait
-from logging import info as log_info
-from math import ceil
-from time import sleep, time
-
-from .jobfunnel import JobFunnel, MASTERLIST_HEADER
-from .tools.tools import filter_non_printables
-from .tools.tools import post_date_from_relative_post_age
-
-
-class Indeed(JobFunnel):
-
- def __init__(self, args):
- super().__init__(args)
- self.provider = 'indeed'
- self.max_results_per_page = 50
- self.headers = {
- 'accept': 'text/html,application/xhtml+xml,application/xml;'
- 'q=0.9,image/webp,*/*;q=0.8',
- 'accept-encoding': 'gzip, deflate, sdch, br',
- 'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
- 'referer': 'https://www.indeed.{0}/'.format(
- self.search_terms['region']['domain']),
- 'upgrade-insecure-requests': '1',
- 'user-agent': self.user_agent,
- 'Cache-Control': 'no-cache',
- 'Connection': 'keep-alive'
- }
- # Sets headers as default on Session object
- self.s.headers.update(self.headers)
- # Concatenates keywords with '+' and encodes spaces as '+'
- self.query = '+'.join(self.search_terms['keywords']).replace(' ', '+')
-
- def convert_radius(self, radius):
- """function that quantizes the user input radius to a valid radius
- value: 5, 10, 15, 25, 50, 100, and 200 kilometers or miles"""
- if radius < 5:
- radius = 0
- elif 5 <= radius < 10:
- radius = 5
- elif 10 <= radius < 15:
- radius = 10
- elif 15 <= radius < 25:
- radius = 15
- elif 25 <= radius < 50:
- radius = 25
- elif 50 <= radius < 100:
- radius = 50
- elif radius >= 100:
- radius = 100
- return radius
-
- def get_search_url(self, method='get'):
- """gets the indeed search url"""
- if method == 'get':
- # form job search url
- search = ('https://www.indeed.{0}/jobs?'
- 'q={1}&l={2}%2C+{3}&radius={4}&limit={5}&filter={6}'.format(
- self.search_terms['region']['domain'],
- self.query,
- self.search_terms['region']['city'].replace(' ', '+'),
- self.search_terms['region']['province'],
- self.convert_radius(
- self.search_terms['region']['radius']),
- self.max_results_per_page,
- int(self.similar_results)))
-
- return search
- elif method == 'post':
- # @TODO implement post style for indeed
- raise NotImplementedError()
- else:
- raise ValueError(f'No html method {method} exists')
-
- def search_page_for_job_soups(self, search, page, job_soup_list):
- """function that scrapes the indeed page for a list of job soups"""
- url = f'{search}&start={int(page * self.max_results_per_page)}'
- log_info(f'getting indeed page {page} : {url}')
-
- jobs = BeautifulSoup(
- self.s.get(url).text, self.bs4_parser). \
- find_all('div', attrs={'data-tn-component': 'organicJob'})
-
- job_soup_list.extend(jobs)
-
- def search_joblink_for_blurb(self, job):
- """function that scrapes the indeed job link for the blurb"""
- search = job['link']
- log_info(f'getting indeed page: {search}')
-
- job_link_soup = BeautifulSoup(
- self.s.get(search).text, self.bs4_parser)
-
- try:
- job['blurb'] = job_link_soup.find(
- id='jobDescriptionText').text.strip()
- except AttributeError:
- job['blurb'] = ''
-
- filter_non_printables(job)
-
- def get_blurb_with_delay(self, job, delay):
- """gets blurb from indeed job link and sets delays for requests"""
- sleep(delay)
-
- search = job['link']
- log_info(f'delay of {delay:.2f}s, getting indeed search: {search}')
-
- res = self.s.get(search).text
- return job, res
-
- def parse_blurb(self, job, html):
- """parses and stores job description into dict entry"""
- job_link_soup = BeautifulSoup(html, self.bs4_parser)
-
- try:
- job['blurb'] = job_link_soup.find(
- id='jobDescriptionText').text.strip()
- except AttributeError:
- job['blurb'] = ''
-
- filter_non_printables(job)
-
- def get_num_pages_to_scrape(self, soup_base, max=0):
- """
- Calculates the number of pages to be scraped.
- Args:
- soup_base: a BeautifulSoup object with the html data.
- At the moment this method assumes that the soup_base was prepared statically.
- max: the maximum number of pages to be scraped.
- Returns:
- The number of pages to be scraped.
- If the number of pages that soup_base yields is higher than max, then max is returned.
- """
- num_res = soup_base.find(id='searchCountPages').contents[0].strip()
- num_res = int(re.findall(r'f (\d+) ', num_res.replace(',', ''))[0])
- number_of_pages = int(ceil(num_res / self.max_results_per_page))
- if max == 0:
- return number_of_pages
- elif number_of_pages < max:
- return number_of_pages
- else:
- return max
-
- def get_title(self, soup):
- """
- Fetches the title from a BeautifulSoup base.
- Args:
- soup: BeautifulSoup base to scrape the title from.
- Returns:
- The job title scraped from soup.
- Note that this function may throw an AttributeError if it cannot find the title.
- The caller is expected to handle this exception.
- """
- return soup.find('a', attrs={
- 'data-tn-element': 'jobTitle'}).text.strip()
-
- def get_company(self, soup):
- """
- Fetches the company from a BeautifulSoup base.
- Args:
- soup: BeautifulSoup base to scrape the company from.
- Returns:
- The company scraped from soup.
- Note that this function may throw an AttributeError if it cannot find the company.
- The caller is expected to handle this exception.
- """
- return soup.find('span', attrs={
- 'class': 'company'}).text.strip()
-
- def get_location(self, soup):
- """
- Fetches the job location from a BeautifulSoup base.
- Args:
- soup: BeautifulSoup base to scrape the location from.
- Returns:
- The job location scraped from soup.
- Note that this function may throw an AttributeError if it cannot find the location.
- The caller is expected to handle this exception.
- """
- return soup.find('span', attrs={
- 'class': 'location'}).text.strip()
-
- def get_tags(self, soup):
- """
- Fetches the job tags/labels from a BeautifulSoup base.
- Args:
- soup: BeautifulSoup base to scrape the tags from.
- Returns:
- The job tags scraped from soup.
- Note that this function may throw an AttributeError if it cannot find the tags.
- The caller is expected to handle this exception.
- """
- table = soup.find(
- 'table', attrs={'class': 'jobCardShelfContainer'}). \
- find_all('td', attrs={'class': 'jobCardShelfItem'})
- return "\n".join([td.text.strip() for td in table])
-
- def get_date(self, soup):
- """
- Fetches the job date from a BeautifulSoup base.
- Args:
- soup: BeautifulSoup base to scrape the date from.
- Returns:
- The job date scraped from soup.
- Note that this function may throw an AttributeError if it cannot find the date.
- The caller is expected to handle this exception.
- """
- return soup.find('span', attrs={
- 'class': 'date'}).text.strip()
-
- def get_id(self, soup):
- """
- Fetches the job id from a BeautifulSoup base.
- Args:
- soup: BeautifulSoup base to scrape the id from.
- Returns:
- The job id scraped from soup.
- Note that this function may throw an AttributeError if it cannot find the id.
- The caller is expected to handle this exception.
- """
- # id regex quantifiers
- id_regex = re.compile(r'id=\"sj_([a-zA-Z0-9]*)\"')
- return id_regex.findall(str(soup.find('a', attrs={
- 'class': 'sl resultLink save-job-link'})))[0]
-
- def get_link(self, job_id):
- """
- Constructs the link with the given job_id.
- Args:
- job_id: The id to be used to construct the link for this job.
- Returns:
- The constructed job link.
- Note that this function does not check the correctness of this link.
- The caller is responsible for checking correctness.
- """
- return (f"http://www.indeed."
- f"{self.search_terms['region']['domain']}"
- f"/viewjob?jk={job_id}")
-
- def scrape(self):
- """function that scrapes job posting from indeed and pickles it"""
- log_info(f'jobfunnel indeed to pickle running @ {self.date_string}')
-
- # get the search url
- search = self.get_search_url()
-
- # get the html data, initialize bs4 with lxml
- request_html = self.s.get(search)
-
- # create the soup base
- soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
-
- # parse total results, and calculate the # of pages needed
- pages = self.get_num_pages_to_scrape(soup_base)
- log_info(f'Found {pages} indeed results for query='
- f'{self.query}')
-
- # init list of job soups
- job_soup_list = []
- # init threads
- threads = ThreadPoolExecutor(max_workers=8)
- # init futures list
- fts = []
-
- # scrape soups for all the pages containing jobs it found
- for page in range(0, pages):
- fts.append( # append thread job future to futures list
- threads.submit(self.search_page_for_job_soups,
- search, page, job_soup_list))
- wait(fts) # wait for all scrape jobs to finish
-
- # make a dict of job postings from the listing briefs
- for s in job_soup_list:
- # init dict to store scraped data
- job = dict([(k, '') for k in MASTERLIST_HEADER])
-
- # scrape the post data
- job['status'] = 'new'
- try:
- # jobs should at minimum have a title, company and location
- job['title'] = self.get_title(s)
- job['company'] = self.get_company(s)
- job['location'] = self.get_location(s)
- except AttributeError:
- continue
-
- job['blurb'] = ''
-
- try:
- job['tags'] = self.get_tags(s)
- except AttributeError:
- job['tags'] = ''
-
- try:
- job['date'] = self.get_date(s)
- except AttributeError:
- job['date'] = ''
-
- try:
- job['id'] = self.get_id(s)
- job['link'] = self.get_link(job['id'])
-
- except (AttributeError, IndexError):
- job['id'] = ''
- job['link'] = ''
-
- job['query'] = self.query
- job['provider'] = self.provider
-
- # key by id
- self.scrape_data[str(job['id'])] = job
-
- # stores references to jobs in list to be used in blurb retrieval
- scrape_list = [i for i in self.scrape_data.values()]
-
- # converts job date formats into a standard date format
- post_date_from_relative_post_age(scrape_list)
-
- # apply job pre-filter before scraping blurbs
- super().pre_filter(self.scrape_data, self.provider)
-
- # checks if delay is set or not, then extracts blurbs from job links
- if self.delay_config is not None:
- # calls super class to run delay specific threading logic
- super().delay_threader(scrape_list, self.get_blurb_with_delay,
- self.parse_blurb, threads)
-
- else:
- # start time recording
- start = time()
-
- # maps jobs to threads and cleans them up when done
- threads.map(self.search_joblink_for_blurb, scrape_list)
- threads.shutdown()
-
- # end and print recorded time
- end = time()
- print(f'{self.provider} scrape job took {(end - start):.3f}s')
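
The Indeed scraper's radius quantization and page arithmetic above can be summarized compactly. The sketch below is an equivalent reformulation using bisect, not the original implementation, and assumes Indeed's 50-results-per-page limit from the class above:

from bisect import bisect_right
from math import ceil

RADIUS_BUCKETS = [5, 10, 15, 25, 50, 100]  # buckets returned by convert_radius above

def convert_radius(radius: int) -> int:
    """Round the requested radius down to the nearest supported bucket (0 below 5)."""
    idx = bisect_right(RADIUS_BUCKETS, radius)
    return RADIUS_BUCKETS[idx - 1] if idx else 0

def pages_to_scrape(num_results: int, per_page: int = 50) -> int:
    """Number of result pages needed to cover num_results postings."""
    return int(ceil(num_results / per_page))

assert convert_radius(12) == 10 and convert_radius(300) == 100
assert pages_to_scrape(130) == 3
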
diff --git a/jobfunnel/jobfunnel.py b/jobfunnel/jobfunnel.py
deleted file mode 100755
index e6d90f9c..00000000
--- a/jobfunnel/jobfunnel.py
+++ /dev/null
@@ -1,393 +0,0 @@
-# Paul McInnis 2018
-# writes pickles to master list path and applies search filters
-
-import csv
-import json
-import logging
-import os
-import pickle
-import random
-import re
-import sys
-
-from collections import OrderedDict
-from concurrent.futures import as_completed
-from datetime import date
-from time import time
-from typing import Dict, List
-from requests import Session
-
-from .tools.delay import delay_alg
-from .tools.filters import tfidf_filter, id_filter, date_filter
-from .tools.tools import proxy_dict_to_url
-
-# setting job status to these words removes them from masterlist + adds to
-# blacklist
-REMOVE_STATUSES = ['archive', 'archived', 'remove', 'rejected']
-
-# csv header
-MASTERLIST_HEADER = ['status', 'title', 'company', 'location', 'date',
- 'blurb', 'tags', 'link', 'id', 'provider', 'query']
-
-# user agent list
-USER_AGENT_LIST = os.path.normpath(
- os.path.join(os.path.dirname(__file__), 'text/user_agent_list.txt'))
-
-
-class JobFunnel(object):
- """class that writes pickles to master list path and applies search
- filters """
-
- def __init__(self, args):
- # The maximum number of days old a job can be
- self.max_listing_days = args['max_listing_days']
- # paths
- self.master_list_path = args['master_list_path']
- self.filterlist_path = args['filter_list_path']
- self.blacklist = args['black_list']
- self.logfile = args['log_path']
- self.loglevel = args['log_level']
- self.pickles_dir = args['data_path']
- self.duplicate_list_path = args['duplicate_list_path']
-
- # other inits
- self.filterlist = None
- self.similar_results = args['similar']
- self.save_dup = args['save_duplicates']
- self.bs4_parser = 'lxml'
- self.scrape_data = {}
-
- # user agent init
- user_agent_list = []
- with open(USER_AGENT_LIST) as file:
- for line in file:
- li = line.strip()
- if li and not li.startswith("#"):
- user_agent_list.append(line.rstrip('\n'))
- self.user_agent = random.choice(user_agent_list)
-
- # date string for pickle files
- self.date_string = date.today().strftime("%Y-%m-%d")
-
- # search term configuration data
- self.search_terms = args['search_terms']
-
- # set delay settings if they exist
- self.delay_config = None
- if args['delay_config'] is not None:
- self.delay_config = args['delay_config']
-
- # set session with (potential proxy)
- self.s = Session()
-
- # set proxy if given
- if args['proxy'] is not None:
- self.s.proxies = {
- args['proxy']['protocol']: proxy_dict_to_url(args['proxy'])
- }
-
- # create data dir
- if not os.path.exists(args['data_path']):
- os.makedirs(args['data_path'])
-
- def init_logging(self):
- # initialise logging to file
- self.logger = logging.getLogger()
- self.logger.setLevel(self.loglevel)
- logging.basicConfig(filename=self.logfile, level=self.loglevel)
- if self.loglevel == 20:
- logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
- else:
- logging.getLogger().addHandler(logging.StreamHandler())
-
- self.logger.info(f'jobfunnel initialized at {self.date_string}')
-
- def get_search_url(self, method='get'):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def get_title(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def get_company(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def get_location(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def get_tags(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def get_date(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def get_id(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def get_link(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def get_number_of_pages(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def scrape(self):
- """function to be implemented by child classes"""
- raise NotImplementedError()
-
- def load_pickle(self, args):
- """function to load today's daily scrape pickle"""
- # only to be used in no_scrape mode
- pickle_filepath = os.path.join(args['data_path'],
- f'jobs_{self.date_string}.pkl')
- try:
- self.scrape_data = pickle.load(open(pickle_filepath, 'rb'))
- except FileNotFoundError as e:
- logging.error(f'{pickle_filepath} not found! Have you scraped '
- f'any jobs today?')
- raise e
-
- def load_pickles(self, args):
- """function to load all historic daily scrape pickles"""
- # only to be used in recovery mode
- pickle_found = False
- pickle_path = os.path.join(args['data_path'])
- for root, dirs, files in os.walk(pickle_path):
- for file in files:
- if re.findall(r'jobs_.*', file):
- if not pickle_found:
- pickle_found = True
- pickle_file = file
- pickle_filepath = os.path.join(pickle_path, pickle_file)
- logging.info(f'loading pickle file: {pickle_filepath}')
- self.scrape_data.update(
- pickle.load(open(pickle_filepath, 'rb')))
- if not pickle_found:
- logging.error(f'no pickles found in {pickle_path}!'
- f' Have you scraped any jobs?')
- raise Exception
-
- def dump_pickle(self):
- """function to dump a pickle of the daily scrape dict"""
- pickle_name = f'jobs_{self.date_string}.pkl'
- pickle.dump(self.scrape_data,
- open(os.path.join(self.pickles_dir, pickle_name), 'wb'))
-
- def read_csv(self, path, key_by_id=True):
- # reads csv passed in as path
- with open(path, 'r', encoding='utf8', errors='ignore') as csvfile:
- reader = csv.DictReader(csvfile)
- if key_by_id:
- return dict([(j['id'], j) for j in reader])
- else:
- return [row for row in reader]
-
- def write_csv(self, data, path, fieldnames=MASTERLIST_HEADER):
- # writes data [dict(),..] to a csv at path
- with open(path, 'w', encoding='utf8') as csvfile:
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
- writer.writeheader()
- for row in data:
- writer.writerow(data[row])
-
- def remove_jobs_in_filterlist(self, data: Dict[str, dict]):
- # load the filter-list if it exists, apply it to remove scraped jobs
- if data == {}:
- raise ValueError('No scraped job data to filter')
-
- if os.path.isfile(self.filterlist_path):
- self.filterlist = json.load(open(self.filterlist_path, 'r'))
- n_filtered = 0
- for jobid in self.filterlist:
- if jobid in data:
- data.pop(jobid)
- n_filtered += 1
- logging.info(f'removed {n_filtered} jobs present in filter-list')
- else:
- if not hasattr(self, 'provider'):
- self.logger.warning(f'no jobs filtered, '
- f'missing {self.filterlist_path}')
-
- def remove_blacklisted_companies(self, data: Dict[str, dict]):
- # remove blacklisted companies from the scraped data
- # @TODO allow people to add companies to this via 'blacklist' status
- blacklist_ids = []
- for job_id, job_data in data.items():
- if job_data['company'] in self.blacklist:
- blacklist_ids.append(job_id)
- logging.info(f'removed {len(blacklist_ids)} jobs '
- f'in blacklist from master-list')
- for job_id in blacklist_ids:
- data.pop(job_id)
-
- def update_filterjson(self):
- # parse master .csv file into an update for the filter-list json file
- if os.path.isfile(self.master_list_path):
- # load existing filtered jobs, if any
- if os.path.isfile(self.filterlist_path):
- filtered_jobs = json.load(open(self.filterlist_path, 'r'))
- else:
- filtered_jobs = {}
-
- # add jobs from csv that need to be filtered away, if any
- for job in self.read_csv(self.master_list_path, key_by_id=False):
- if job['status'] in REMOVE_STATUSES:
- if job['id'] not in filtered_jobs:
- logging.info('added {} to {}'.format(
- job['id'], self.filterlist_path))
- filtered_jobs[job['id']] = job
-
- # write out complete list with any additions from the masterlist
- with open(self.filterlist_path, 'w', encoding='utf8') as outfile:
- outfile.write(
- json.dumps(
- filtered_jobs,
- indent=4,
- sort_keys=True,
- separators=(',', ': '),
- ensure_ascii=False))
-
- # update class attribute
- self.filterlist = filtered_jobs
- else:
- logging.warning("no master-list, filter-list was not updated")
-
- def pre_filter(self, data: Dict[str, dict], provider):
- """function called by child classes that applies multiple filters
- before getting job blurbs"""
- # call date_filter if it is turned on
- if self.max_listing_days is not None:
- date_filter(data, self.max_listing_days)
- # call id_filter for master and duplicate lists, if they exist
- if os.path.isfile(self.master_list_path):
- id_filter(data, self.read_csv(self.master_list_path),
- provider)
- if os.path.isfile(self.duplicate_list_path):
- id_filter(data, self.read_csv(
- self.duplicate_list_path), provider)
-
- # filter out scraped jobs we have rejected, archived or blacklisted
- try:
- self.remove_jobs_in_filterlist(data)
- except ValueError:
- pass
-
- self.remove_blacklisted_companies(data)
-
- def delay_threader(self,
- scrape_list: List[Dict], scrape_fn, parse_fn, threads):
- """function called by child classes to run scrape jobs in threads
- with delays"""
- if not scrape_list:
- raise ValueError('No jobs to scrape')
- # calls delaying algorithm
- print("Calculating delay...")
- delays = delay_alg(len(scrape_list), self.delay_config)
- print("Done! Starting scrape!")
- # zips delays and scrape list as jobs for thread pool
- scrape_jobs = zip(scrape_list, delays)
- # start time recording
- start = time()
- # submits jobs and stores futures in dict
- results = {threads.submit(scrape_fn, job, delays): job['id']
- for job, delays in scrape_jobs}
-
- # loops through futures and removes each if successfully parsed
- while results:
- # parses futures as they complete
- for future in as_completed(results):
- try:
- job, html = future.result()
- parse_fn(job, html)
- del html
- except Exception as e:
- self.logger.error(f'Blurb Future Error: {e}')
- finally:
- # always drop the completed future so the outer loop terminates
- del results[future]
-
- threads.shutdown() # clean up threads when done
- # end and print recorded time
- end = time()
- print(f'{self.provider} scrape job took {(end - start):.3f}s')
-
- def update_masterlist(self):
- """use the scraped job listings to update the master spreadsheet"""
- if self.scrape_data == {}:
- raise ValueError('No scraped jobs, cannot update masterlist')
-
- # converts scrape data to ordered dictionary to filter all duplicates
- self.scrape_data = OrderedDict(sorted(self.scrape_data.items(),
- key=lambda t: t[1]['tags']))
- # filter out scraped jobs we have rejected, archived or blacklisted
- self.remove_jobs_in_filterlist(self.scrape_data)
- self.remove_blacklisted_companies(self.scrape_data)
-
- # load and update existing masterlist
- try:
- # open masterlist if it exists & init updated masterlist
- masterlist = self.read_csv(self.master_list_path)
-
- # update masterlist to remove filtered/blacklisted jobs
- self.remove_jobs_in_filterlist(masterlist)
- self.remove_blacklisted_companies(masterlist)
-
- # update masterlist to contain only new (unique) listings
- if self.save_dup: # if true, saves duplicates to own file
- # calls tfidf filter and returns popped duplicate list
- duplicate_list = tfidf_filter(self.scrape_data, masterlist)
-
- logging.info(f'Saving {len(duplicate_list)} duplicate jobs to'
- f' {self.duplicate_list_path}')
- # checks if duplicate list has entries
- if len(duplicate_list) > 0:
- # checks if duplicate_list.csv exists
- if os.path.isfile(self.duplicate_list_path):
- # loads and adds current duplicates to list
- master_dup = self.read_csv(self.duplicate_list_path)
- master_dup.update(duplicate_list)
- self.write_csv(data=master_dup,
- path=self.duplicate_list_path)
- else:
- # saves duplicates to duplicates_list.csv
- self.write_csv(data=duplicate_list,
- path=self.duplicate_list_path)
- else:
- tfidf_filter(self.scrape_data, masterlist)
-
- masterlist.update(self.scrape_data)
-
- # save
- self.write_csv(data=masterlist, path=self.master_list_path)
-
- except FileNotFoundError:
- # run tfidf filter on initial scrape
- if self.save_dup: # if true saves duplicates to own file
- duplicate_list = tfidf_filter(self.scrape_data)
-
- logging.info(
- f'Saving {len(duplicate_list)} duplicate jobs to '
- f'{self.duplicate_list_path}')
-
- if len(duplicate_list) > 0:
- # saves duplicates to duplicates_list.csv
- self.write_csv(data=duplicate_list,
- path=self.duplicate_list_path)
-
- else:
- tfidf_filter(self.scrape_data)
-
- # dump the results into the data folder as the masterlist
- self.write_csv(data=self.scrape_data, path=self.master_list_path)
- logging.info(
- f'no masterlist detected, added {len(self.scrape_data.keys())}'
- f' jobs to {self.master_list_path}')
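
For reference, the base class above was driven by child scrapers in a fixed sequence. A sketch of that flow using only methods defined above; the run_provider wrapper and the exact ordering are illustrative, not part of the module:

def run_provider(scraper):
    """Scrape one provider, pickle today's postings, then merge into the master CSV."""
    scraper.init_logging()
    scraper.scrape()             # populates scraper.scrape_data keyed by job id
    scraper.dump_pickle()        # writes data_path/jobs_<YYYY-MM-DD>.pkl
    scraper.update_masterlist()  # applies filters and rewrites master_list_path
    scraper.update_filterjson()  # folds rejected/archived rows into the filter list
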
diff --git a/jobfunnel/monster.py b/jobfunnel/monster.py
deleted file mode 100644
index 76dd984e..00000000
--- a/jobfunnel/monster.py
+++ /dev/null
@@ -1,242 +0,0 @@
-import re
-
-from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor
-from logging import info as log_info
-from math import ceil
-from time import sleep, time
-
-from .jobfunnel import JobFunnel, MASTERLIST_HEADER
-from .tools.tools import filter_non_printables
-from .tools.tools import post_date_from_relative_post_age
-
-
-class Monster(JobFunnel):
-
- def __init__(self, args):
- super().__init__(args)
- self.provider = 'monster'
- self.max_results_per_page = 25
- self.headers = {
- 'accept': 'text/html,application/xhtml+xml,application/xml;'
- 'q=0.9,image/webp,*/*;q=0.8',
- 'accept-encoding': 'gzip, deflate, sdch, br',
- 'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
- 'referer': 'https://www.monster.{0}/'.format(
- self.search_terms['region']['domain']),
- 'upgrade-insecure-requests': '1',
- 'user-agent': self.user_agent,
- 'Cache-Control': 'no-cache',
- 'Connection': 'keep-alive'
- }
- # Sets headers as default on Session object
- self.s.headers.update(self.headers)
- # Concatenates keywords with '-' and encodes spaces as '-'
- self.query = '-'.join(self.search_terms['keywords']).replace(' ', '-')
-
- def convert_radius(self, radius):
- """function that quantizes the user input radius to a valid radius
- in either kilometers or miles"""
- if self.search_terms['region']['domain'] == 'com':
- if radius < 5:
- radius = 0
- elif 5 <= radius < 10:
- radius = 5
- elif 10 <= radius < 20:
- radius = 10
- elif 20 <= radius < 30:
- radius = 20
- elif 30 <= radius < 40:
- radius = 30
- elif 40 <= radius < 50:
- radius = 40
- elif 50 <= radius < 60:
- radius = 50
- elif 60 <= radius < 75:
- radius = 60
- elif 75 <= radius < 100:
- radius = 75
- elif 100 <= radius < 150:
- radius = 100
- elif 150 <= radius < 200:
- radius = 150
- elif radius >= 200:
- radius = 200
- else:
- if radius < 5:
- radius = 0
- elif 5 <= radius < 10:
- radius = 5
- elif 10 <= radius < 20:
- radius = 10
- elif 20 <= radius < 50:
- radius = 20
- elif 50 <= radius < 100:
- radius = 50
- elif radius >= 100:
- radius = 100
-
- return radius
-
- def get_search_url(self, method='get'):
- """gets the monster request html"""
- # form job search url
- if method == 'get':
- search = ('https://www.monster.{0}/jobs/search/?'
- 'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
- self.search_terms['region']['domain'],
- self.query,
- self.search_terms['region']['city'].replace(' ', "-"),
- self.search_terms['region']['province'],
- 'skr_navigation_nhpso_searchMain',
- self.convert_radius(self.search_terms['region']['radius'])))
-
- return search
- elif method == 'post':
- # @TODO implement post style for monster
- raise NotImplementedError()
- else:
- raise ValueError(f'No html method {method} exists')
-
- def search_joblink_for_blurb(self, job):
- """function that scrapes the monster job link for the blurb"""
- search = job['link']
- log_info(f'getting monster search: {search}')
-
- job_link_soup = BeautifulSoup(
- self.s.get(search).text, self.bs4_parser)
-
- try:
- job['blurb'] = job_link_soup.find(
- id='JobDescription').text.strip()
- except AttributeError:
- job['blurb'] = ''
-
- filter_non_printables(job)
-
- # the above function is split in two so that fetched blurbs can be parsed
- # while the remaining blurbs are still being retrieved
- def get_blurb_with_delay(self, job, delay):
- """gets blurb from monster job link and sets delays for requests"""
- sleep(delay)
-
- search = job['link']
- log_info(f'delay of {delay:.2f}s, getting monster search: {search}')
-
- res = self.s.get(search).text
- return job, res
-
- def parse_blurb(self, job, html):
- """parses and stores job description into dict entry"""
- job_link_soup = BeautifulSoup(html, self.bs4_parser)
-
- try:
- job['blurb'] = job_link_soup.find(
- id='JobDescription').text.strip()
- except AttributeError:
- job['blurb'] = ''
-
- filter_non_printables(job)
-
- def scrape(self):
- """function that scrapes job posting from monster and pickles it"""
- log_info(f'jobfunnel monster to pickle running @ {self.date_string}')
-
- # get the search url
- search = self.get_search_url()
-
- # get the html data, initialize bs4 with lxml
- request_html = self.s.get(search)
-
- # create the soup base
- soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
-
- # scrape total number of results, and calculate the # pages needed
- num_res = soup_base.find('h2', 'figure').text.strip()
- num_res = int(re.findall(r'(\d+)', num_res)[0])
- log_info(f'Found {num_res} monster results for query='
- f'{self.query}')
-
- pages = int(ceil(num_res / self.max_results_per_page))
- # scrape soups for all the pages containing jobs it found
- page_url = f'{search}&start={pages}'
- log_info(f'getting monster pages 1 to {pages} : {page_url}')
-
- jobs = BeautifulSoup(
- self.s.get(page_url).text, self.bs4_parser). \
- find_all('div', attrs={'class': 'flex-row'})
-
- job_soup_list = []
- job_soup_list.extend(jobs)
-
- # id regex quantifiers
- id_regex = re.compile(r'/((?:[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f'
- r']{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12})|\d+)')
-
- # make a dict of job postings from the listing briefs
- for s in job_soup_list:
- # init dict to store scraped data
- job = dict([(k, '') for k in MASTERLIST_HEADER])
-
- # scrape the post data
- job['status'] = 'new'
- try:
- # jobs should at minimum have a title, company and location
- job['title'] = s.find('h2', attrs={
- 'class': 'title'}).text.strip()
- job['company'] = s.find(
- 'div', attrs={'class': 'company'}).text.strip()
- job['location'] = s.find('div', attrs={
- 'class': 'location'}).text.strip()
- except AttributeError:
- continue
-
- # no blurb is available in monster job soups
- job['blurb'] = ''
- # tags are not supported in monster
- job['tags'] = ''
- try:
- job['date'] = s.find('time').text.strip()
- except AttributeError:
- job['date'] = ''
- # captures uuid or int ids, by extracting from url instead
- try:
- job['link'] = str(s.find('a', attrs={
- 'data-bypass': 'true'}).get('href'))
- job['id'] = id_regex.findall(job['link'])[0]
- except AttributeError:
- job['id'] = ''
- job['link'] = ''
-
- job['query'] = self.query
- job['provider'] = self.provider
-
- # key by id
- self.scrape_data[str(job['id'])] = job
-
- # Do not change the order of the next three statements if you want date_filter to work
-
- # stores references to jobs in list to be used in blurb retrieval
- scrape_list = [i for i in self.scrape_data.values()]
- # converts job date formats into a standard date format
- post_date_from_relative_post_age(scrape_list)
- # apply job pre-filter before scraping blurbs
- super().pre_filter(self.scrape_data, self.provider)
-
- threads = ThreadPoolExecutor(max_workers=8)
- # checks if delay is set or not, then extracts blurbs from job links
- if self.delay_config is not None:
- # calls super class to run delay specific threading logic
- super().delay_threader(scrape_list, self.get_blurb_with_delay,
- self.parse_blurb, threads)
- else:
- # start time recording
- start = time()
-
- # maps jobs to threads and cleans them up when done
- threads.map(self.search_joblink_for_blurb, scrape_list)
- threads.shutdown()
-
- # end and print recorded time
- end = time()
- print(f'{self.provider} scrape job took {(end - start):.3f}s')
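
Monster job ids are extracted from the posting URL rather than from a data attribute, and the regex above accepts either a UUID or a plain integer path segment. A quick check of that pattern against made-up URLs:

import re

ID_REGEX = re.compile(
    r'/((?:[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12})|\d+)'
)

print(ID_REGEX.findall('https://job-openings.monster.ca/python-developer/216439044'))
# ['216439044']
print(ID_REGEX.findall(
    'https://job-openings.monster.ca/python-developer/2c1f60b4-9d2b-4a6e-8f3a-0c1d2e3f4a5b'
))
# ['2c1f60b4-9d2b-4a6e-8f3a-0c1d2e3f4a5b']
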
diff --git a/jobfunnel/resources/__init__.py b/jobfunnel/resources/__init__.py
new file mode 100644
index 00000000..5f0c1620
--- /dev/null
+++ b/jobfunnel/resources/__init__.py
@@ -0,0 +1,2 @@
+from jobfunnel.resources.resources import *
+from jobfunnel.resources.enums import *
diff --git a/jobfunnel/resources/defaults.py b/jobfunnel/resources/defaults.py
new file mode 100644
index 00000000..47b58efa
--- /dev/null
+++ b/jobfunnel/resources/defaults.py
@@ -0,0 +1,47 @@
+"""Default values for both JobFunnelConfigManager and the CLI arguments.
+NOTE: not all defaults here are used, since the demo relies on YAML rather than kwargs.
+"""
+import os
+from pathlib import Path
+from jobfunnel.resources.enums import Locale, DelayAlgorithm, Provider
+
+USER_HOME_DIRECTORY = os.path.abspath(str(Path.home()))
+
+DEFAULT_LOG_LEVEL_NAME = 'INFO'
+DEFAULT_LOCALE = Locale.CANADA_ENGLISH
+DEFAULT_CITY = 'Waterloo'
+DEFAULT_PROVINCE = 'ON'
+DEFAULT_SEARCH_KEYWORDS = ['Python']
+DEFAULT_COMPANY_BLOCK_LIST = []
+DEFAULT_OUTPUT_DIRECTORY = os.path.join(
+ USER_HOME_DIRECTORY, 'job_search_results'
+)
+DEFAULT_CACHE_DIRECTORY = os.path.join(DEFAULT_OUTPUT_DIRECTORY, '.cache')
+DEFAULT_BLOCK_LIST_FILE = os.path.join(DEFAULT_CACHE_DIRECTORY, 'block.json')
+DEFAULT_DUPLICATES_FILE = os.path.join(
+ DEFAULT_CACHE_DIRECTORY, 'duplicates.json'
+)
+DEFAULT_LOG_FILE = os.path.join(DEFAULT_OUTPUT_DIRECTORY, 'log.log')
+DEFAULT_MASTER_CSV_FILE = os.path.join(DEFAULT_OUTPUT_DIRECTORY, 'master.csv')
+DEFAULT_SEARCH_RADIUS = 25
+DEFAULT_MAX_LISTING_DAYS = 60
+DEFAULT_DELAY_MAX_DURATION = 5.0
+DEFAULT_DELAY_MIN_DURATION = 1.0
+DEFAULT_DELAY_ALGORITHM = DelayAlgorithm.LINEAR
+# FIXME: re-enable glassdoor once we fix issue with it. (#87)
+DEFAULT_PROVIDERS = [Provider.MONSTER, Provider.INDEED] #, Provider.GLASSDOOR]
+DEFAULT_PROVIDER_NAMES = [p.name for p in DEFAULT_PROVIDERS]
+DEFAULT_NO_SCRAPE = False
+DEFAULT_USE_WEB_DRIVER = False
+DEFAULT_RECOVER = False
+DEFAULT_RETURN_SIMILAR_RESULTS = False
+DEFAULT_SAVE_DUPLICATES = False
+DEFAULT_RANDOM_DELAY = False
+DEFAULT_RANDOM_CONVERGING_DELAY = False
+
+# Default domain per locale; any scraper can override this.
+DEFAULT_DOMAIN_FROM_LOCALE = {
+ Locale.CANADA_ENGLISH: 'ca',
+ Locale.CANADA_FRENCH: 'ca',
+ Locale.USA_ENGLISH: 'com',
+}
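
The path defaults above all hang off the output directory. A small illustrative helper, not part of the package, showing how the same layout would be derived for a custom output directory:

import os

def derive_output_paths(output_dir: str) -> dict:
    """Mirror the default layout above under an arbitrary output directory."""
    cache = os.path.join(output_dir, '.cache')
    return {
        'cache_folder': cache,
        'block_list_file': os.path.join(cache, 'block.json'),
        'duplicates_list_file': os.path.join(cache, 'duplicates.json'),
        'log_file': os.path.join(output_dir, 'log.log'),
        'master_csv_file': os.path.join(output_dir, 'master.csv'),
    }

# e.g. derive_output_paths(os.path.expanduser('~/job_search_results'))
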
diff --git a/jobfunnel/resources/enums.py b/jobfunnel/resources/enums.py
new file mode 100644
index 00000000..ac8cc0d3
--- /dev/null
+++ b/jobfunnel/resources/enums.py
@@ -0,0 +1,78 @@
+from enum import Enum
+
+
+class Locale(Enum):
+ """Identifies which domains of which websites Scrapers / Filters / Main
+ support.
+
+ Locale must be set, as it selects the code path used to form the correct
+ GET requests for interacting with a given job source.
+ """
+ CANADA_ENGLISH = 1
+ CANADA_FRENCH = 2
+ USA_ENGLISH = 3
+
+
+class JobStatus(Enum):
+ """Job statuses that are built into jobfunnel
+ NOTE: these are the only valid values for entries in 'status' in our CSV
+ """
+ UNKNOWN = 1
+ NEW = 2
+ ARCHIVE = 3
+ INTERVIEWING = 4
+ INTERVIEWED = 5
+ REJECTED = 6
+ ACCEPTED = 7
+ DELETE = 8
+ INTERESTED = 9
+ APPLIED = 10
+ APPLY = 11
+ OLD = 12
+
+
+class JobField(Enum):
+ """Fields of job that we need setters for, passed to Scraper.get(field=...)
+ """
+ TITLE = 0
+ COMPANY = 1
+ LOCATION = 2
+ DESCRIPTION = 3
+ KEY_ID = 4
+ URL = 5
+ LOCALE = 6
+ QUERY = 7
+ PROVIDER = 8
+ STATUS = 9
+ SCRAPE_DATE = 10
+ SHORT_DESCRIPTION = 11
+ POST_DATE = 12
+ RAW = 13
+ TAGS = 14
+ WAGE = 15
+ REMOTE = 16
+
+
+class DuplicateType(Enum):
+ """Ways in which a job can be a duplicate
+ NOTE: we use these to determine what action(s) to take for a duplicate
+ """
+ KEY_ID = 0
+ EXISTING_TFIDF = 1
+ NEW_TFIDF = 2
+
+
+class Provider(Enum):
+ """Job source providers
+ """
+ INDEED = 1
+ GLASSDOOR = 2
+ MONSTER = 3
+
+
+class DelayAlgorithm(Enum):
+ """delaying algorithms
+ """
+ CONSTANT = 1
+ SIGMOID = 2
+ LINEAR = 3
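
These enums plug into the defaults above; for instance, the locale-to-domain table in defaults.py is keyed by Locale, and DelayAlgorithm members can be looked up by name from CLI strings. A short sketch assuming the package is installed; resolve_domain is illustrative, not a real package function:

from jobfunnel.resources.enums import DelayAlgorithm, Locale
from jobfunnel.resources.defaults import DEFAULT_DOMAIN_FROM_LOCALE

def resolve_domain(locale: Locale) -> str:
    """Pick the top-level domain a scraper should use for a locale."""
    return DEFAULT_DOMAIN_FROM_LOCALE[locale]

assert resolve_domain(Locale.CANADA_ENGLISH) == 'ca'
assert DelayAlgorithm['LINEAR'] is DelayAlgorithm.LINEAR  # lookup by name
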
diff --git a/jobfunnel/resources/resources.py b/jobfunnel/resources/resources.py
new file mode 100644
index 00000000..ffddb64a
--- /dev/null
+++ b/jobfunnel/resources/resources.py
@@ -0,0 +1,38 @@
+"""String-like resources and other constants are initialized here.
+"""
+import datetime
+import os
+import string
+
+# CSV header for the output CSV. Do not remove anything or you'll break users' CSVs.
+# TODO: need to add short and long descriptions (breaking change)
+CSV_HEADER = [
+ 'status', 'title', 'company', 'location', 'date', 'blurb', 'tags', 'link',
+ 'id', 'provider', 'query', 'locale', 'wage', 'remote',
+]
+
+LOG_LEVEL_NAMES = [
+ 'CRITICAL', 'FATAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'
+]
+
+MIN_DESCRIPTION_CHARS = 5 # Job.description shorter than this fails validation.
+MAX_CPU_WORKERS = 8 # Maximum num threads we use when scraping
+MIN_JOBS_TO_PERFORM_SIMILARITY_SEARCH = 25 # Minimum # of jobs needed to run TFIDF
+MAX_BLOCK_LIST_DESC_CHARS = 150 # Maximum len of description in block_list JSON
+DEFAULT_MAX_TFIDF_SIMILARITY = 0.75 # Maximum similarity between job text TFIDF
+
+BS4_PARSER = 'lxml'
+T_NOW = datetime.datetime.today() # NOTE: use today so we only compare days
+
+PRINTABLE_STRINGS = set(string.printable)
+
+# Load the user agent list once only.
+USER_AGENT_LIST_FILE = os.path.normpath(
+ os.path.join(os.path.dirname(__file__), 'user_agent_list.txt')
+)
+USER_AGENT_LIST = []
+with open(USER_AGENT_LIST_FILE) as file:
+ for line in file:
+ li = line.strip()
+ if li and not li.startswith("#"):
+ USER_AGENT_LIST.append(line.rstrip('\n'))
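
USER_AGENT_LIST is loaded once at import time; callers then pick one agent per session, as the old JobFunnel base class did with random.choice. A minimal usage sketch, with the session setup itself being illustrative:

import random
from requests import Session
from jobfunnel.resources.resources import USER_AGENT_LIST

session = Session()
session.headers.update({'User-Agent': random.choice(USER_AGENT_LIST)})
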
diff --git a/jobfunnel/text/user_agent_list.txt b/jobfunnel/resources/user_agent_list.txt
similarity index 98%
rename from jobfunnel/text/user_agent_list.txt
rename to jobfunnel/resources/user_agent_list.txt
index 6de5a410..97711f52 100644
--- a/jobfunnel/text/user_agent_list.txt
+++ b/jobfunnel/resources/user_agent_list.txt
@@ -1,5 +1,4 @@
-# user agent list
-# https://developers.whatismybrowser.com/useragents/explore/
+# User agent list: https://developers.whatismybrowser.com/useragents/explore/
# chrome
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36
@@ -52,4 +51,4 @@ Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)
-Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET
+Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET
diff --git a/jobfunnel/tools/delay.py b/jobfunnel/tools/delay.py
deleted file mode 100644
index 75f69efd..00000000
--- a/jobfunnel/tools/delay.py
+++ /dev/null
@@ -1,122 +0,0 @@
-"""
-Module for calculating random or non-random delay
-"""
-import sys
-
-from math import ceil, log, sqrt
-from numpy import arange
-from random import uniform
-from scipy.special import expit
-from typing import Dict, Union
-from logging import warning
-
-
-def _c_delay(list_len: int, delay: Union[int, float]):
- """ Sets a single delay value for the whole list.
-
- """
- delays = [delay] * list_len
- # sets incrementing offsets to the first 8 elements
- inc = .2 # Increment set to .2
- offset = len(delays[0:8]) / 5 # offset
- # checks if delay is < 1.5
- if delay < 1.5:
- # changes increment and offset, to prevent 0s and negative nums
- inc = delay / 8
- offset = float(len(delays[0:8])) * inc
- # division here is faster since they are both ints
- delays[0:8] = [(x - offset) + i * inc for i, x in enumerate(delays[0:8])]
- return delays
-
-
-def _lin_delay(list_len: int, delay: Union[int, float]):
- """ Ramps delays along y = 0.2 * x, then holds y = delay once the ramp reaches it.
-
- """
- # calculates x value where lines intersect
- its = 5 * delay # its = intersection
- # any delay of .2 or less is hard delay
- if its <= 1:
- return _c_delay(list_len, delay)
- else:
- # prevents slicing from breaking if delay is a float
- if isinstance(its, float):
- its = int(ceil(its))
- # create list of x values based on scrape list size
- delays = [*range(list_len)]
- delays[0:its] = [x / 5 for x in delays[0:its]]
- delays[its:] = [delay] * (len(delays) - its)
- return delays
-
-
-# https://en.wikipedia.org/wiki/Generalised_logistic_function
-def _sig_delay(list_len: int, delay: Union[int, float]):
- """ Calculates Richards/Sigmoid curve for delay.
-
- """
- gr = sqrt(delay) * 4 # growth rate
- y_0 = log(4 * delay) # Y(0)
- # calculates sigmoid curve using vars rewritten to be our x
- delays = delay * expit(arange(list_len) / gr - y_0)
- return delays.tolist() # convert np array back to list
-
-
-def delay_alg(list_len, delay_config: Dict):
- """ Checks delay config and returns calculated delay list.
-
- Args:
- list_len: length of scrape job list
- delay_config: Delay configuration dictionary
-
- Returns:
- list of delay time matching length of scrape job list
- """
- if isinstance(list_len, list): # Prevents breaking if a list was passed
- list_len = len(list_len)
-
- # init and check numerical arguments
- delay = delay_config['delay']
- if delay <= 0:
- raise ValueError("\nYour delay is set to 0 or less.\nCancelling "
- "execution...")
-
- min_delay = delay_config['min_delay']
- if min_delay < 0 or min_delay >= delay:
- warning(
- "\nMinimum delay is below 0, or more than or equal to delay."
- "\nSetting to 0 and continuing execution."
- "\nIf this was a mistake, check your command line"
- " arguments or settings file. \n")
- min_delay = 0
-
- # delay calculations using specified equations
- if delay_config['function'] == 'constant':
- delay_calcs = _c_delay(list_len, delay)
- elif delay_config['function'] == 'linear':
- delay_calcs = _lin_delay(list_len, delay)
- elif delay_config['function'] == 'sigmoid':
- delay_calcs = _sig_delay(list_len, delay)
-
- # check if minimum delay is above 0 and less than last element
- if 0 < min_delay:
- # sets min_delay to values greater than itself in delay_calcs
- for i, n in enumerate(delay_calcs):
- if n > min_delay:
- break
- delay_calcs[i] = min_delay
-
- # output the final list of delays rounded to 3 decimal places
- if delay_config['random']: # check if random delay was specified
- # random.uniform(a, b) a = lower bound, b = upper bound
- if delay_config['converge']: # checks if converging delay is True
- # delay_calcs = lower bound, delay = upper bound
- delays = [round(uniform(x, delay), 3) for x in delay_calcs]
- else:
- # lb = lower bounds, delay_calcs = upper bound
- delays = [round(uniform(min_delay, x), 3) for x in delay_calcs]
-
- else:
- delays = [round(i, 3) for i in delay_calcs]
- # set first element to 0 so scrape starts right away
- delays[0] = 0
- return delays
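
For reference, delay_alg took the scrape-list length (or the list itself) plus a config dict with 'delay', 'min_delay', 'function', 'random' and 'converge' keys, and returned one delay per job with the first forced to 0. A small usage sketch against the module as it stood before this change, with illustrative values:

from jobfunnel.tools.delay import delay_alg  # import path prior to this change

delay_config = {
    'delay': 10.0,      # upper bound, in seconds
    'min_delay': 1.0,   # floor applied to the early ramp
    'function': 'linear',
    'random': False,
    'converge': False,
}

delays = delay_alg(100, delay_config)
assert len(delays) == 100 and delays[0] == 0
assert max(delays) <= 10.0
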
diff --git a/jobfunnel/tools/filters.py b/jobfunnel/tools/filters.py
deleted file mode 100644
index cd3fcfc1..00000000
--- a/jobfunnel/tools/filters.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import nltk
-import logging
-from datetime import datetime, date, timedelta
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-from typing import Dict, Optional
-from numpy import delete as np_delete, max as np_max, fill_diagonal
-
-
-def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
- """Filter out jobs that are older than number_of_days
- The assumed date format is yyyy-mm-dd
- Args:
- cur_dict: today's job scrape dict
- number_of_days: how many days old a job can be
- """
- if number_of_days < 0 or cur_dict is None:
- return
- print("date_filter running")
- cur_job_ids = [job['id'] for job in cur_dict.values()]
- # calculate the oldest date a job can be
- threshold_date = datetime.now() - timedelta(days=number_of_days)
- for job_id in cur_job_ids:
- # get the date from job with job_id
- job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
- # if this job is older than threshold_date, delete it from current scrape
- if job_date < threshold_date:
- logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter because"
- f" it is older than {number_of_days} days")
- del cur_dict[job_id]
-
-
-def id_filter(cur_dict: Dict[str, dict], prev_dict: Dict[str, dict], provider):
- """ Filter duplicates on job id per provider.
-
- Args:
- cur_dict: today's job scrape dict
- prev_dict: the existing master list job dict
- provider: job board used
-
- """
- # get job ids from scrape and master list by provider as lists
- cur_job_ids = [job['id'] for job in cur_dict.values()]
- prev_job_ids = [job['id'] for job in prev_dict.values()
- if job['provider'] == provider]
-
- # pop duplicate job ids from current scrape
- duplicate_ids = [cur_dict.pop(job_id)['id'] for job_id in cur_job_ids
- if job_id in prev_job_ids]
-
- # log duplicate ids
- logging.info(f'found {len(cur_dict.keys())} unique job ids and '
- f'{len(duplicate_ids)} duplicates from {provider}')
-
-
-def tfidf_filter(cur_dict: Dict[str, dict],
- prev_dict: Optional[Dict[str, dict]] = None,
- max_similarity: float = 0.75):
- """ Fit a tfidf vectorizer to a corpus of all listing's text.
-
- Args:
- cur_dict: today's job scrape dict
- prev_dict: the existing master list job dict
- max_similarity: threshold above which blurb similarity = duplicate
-
- Returns:
- list of duplicate job ids which were removed from cur_dict
- """
- # retrieve stopwords if not already downloaded
- try:
- stopwords = nltk.corpus.stopwords.words('english')
- except LookupError:
- nltk.download('stopwords', quiet=True)
- stopwords = nltk.corpus.stopwords.words('english')
-
- # init vectorizer
- vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True,
- analyzer='word', stop_words=stopwords)
-
- # init dict to store duplicate ids
- duplicate_ids = {}
-
- if prev_dict is None:
- # get query words and ids as lists
- query_ids = [job['id'] for job in cur_dict.values()]
- query_words = [job['blurb'] for job in cur_dict.values()]
-
- # returns cosine similarity between jobs as square matrix (n,n)
- similarities = cosine_similarity(vectorizer.fit_transform(query_words))
- # fills diagonals with 0, so whole dict does not get popped
- fill_diagonal(similarities, 0)
- # init index
- index = 0
- # identifies duplicates and stores them in duplicate ids dictionary
- while True:
- # loop breaks when index is equal to matrix height
- if index == len(similarities):
- break
-
- # deletes row and column, every time a max is found for a job id
- if np_max(similarities[index]) >= max_similarity:
- # query ids are popped so index always matches correct element
- duplicate_ids.update(
- {query_ids[index]: cur_dict.pop(query_ids.pop(index))})
- # reduce matrix dimensions, (n-1, n-1)
- similarities = np_delete(similarities, index, axis=0)
- similarities = np_delete(similarities, index, axis=1)
-
- else: # increment index by one
- index += 1
- # log something
- logging.info(f'Found and removed {len(duplicate_ids.keys())} '
- f're-posts/duplicates via TFIDF cosine similarity!')
-
- else:
- # checks current scrape for re-posts/duplicates
- duplicate_ids = tfidf_filter(cur_dict)
-
- # get query words and ids as lists
- query_ids = [job['id'] for job in cur_dict.values()]
- query_words = [job['blurb'] for job in cur_dict.values()]
-
- # get reference words as list
- reference_words = [job['blurb'] for job in prev_dict.values()]
-
- # fit vectorizer to entire corpus
- vectorizer.fit(query_words + reference_words)
-
- # set reference tfidf for cosine similarity later
- references = vectorizer.transform(reference_words)
-
- # calculate cosine similarity between reference and current blurbs
- similarities = cosine_similarity(
- vectorizer.transform(query_words), references)
-
- # get duplicate job ids and pop them
- for sim, query_id in zip(similarities, query_ids):
- if np_max(sim) >= max_similarity:
- duplicate_ids.update({query_id: cur_dict.pop(query_id)})
-
- # log something
- logging.info(f'found {len(cur_dict.keys())} unique listings and '
- f'{len(duplicate_ids.keys())} duplicates '
- f'via TFIDF cosine similarity')
-
- # returns a dictionary of duplicates
- return duplicate_ids
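
The removed `tfidf_filter` above boils down to one idea: vectorize job blurbs with TF-IDF and treat pairs whose cosine similarity meets a threshold as duplicates. Below is a small standalone sketch of that idea using the same scikit-learn primitives; it is a simplified illustration (it keeps the first of each near-duplicate pair rather than replaying the removed row-deletion loop), and `find_duplicate_indices` is a name invented for this example:

```
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def find_duplicate_indices(blurbs, max_similarity=0.75):
    """Return indices of blurbs that near-duplicate an earlier blurb."""
    vectors = TfidfVectorizer(strip_accents='unicode', lowercase=True,
                              stop_words='english').fit_transform(blurbs)
    similarities = cosine_similarity(vectors)
    np.fill_diagonal(similarities, 0.0)  # ignore self-similarity
    duplicates = []
    for i in range(1, len(blurbs)):
        # compare only against earlier postings so one copy is always kept
        if similarities[i, :i].max() >= max_similarity:
            duplicates.append(i)
    return duplicates


blurbs = [
    'Looking for a passionate Python developer to build scrapers.',
    'Looking for a passionate Python developer to build scrapers and pipelines.',
    'We make the best ice cream in the world.',
]
print(find_duplicate_indices(blurbs))  # [1] -- the second blurb repeats the first
```
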
diff --git a/jobfunnel/tools/tools.py b/jobfunnel/tools/tools.py
deleted file mode 100644
index 2412103d..00000000
--- a/jobfunnel/tools/tools.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import logging
-import re
-import string
-
-from copy import deepcopy
-from dateutil.relativedelta import relativedelta
-from datetime import datetime, timedelta
-
-from webdriver_manager.chrome import ChromeDriverManager
-from webdriver_manager.microsoft import IEDriverManager
-from webdriver_manager.microsoft import EdgeChromiumDriverManager
-from webdriver_manager.opera import OperaDriverManager
-from webdriver_manager.firefox import GeckoDriverManager
-
-from selenium import webdriver
-
-
-def filter_non_printables(job):
- """function that filters trailing characters in scraped strings"""
- # filter all of the weird characters some job postings have...
- printable = set(string.printable)
- job['title'] = ''.join(filter(lambda x: x in printable, job['title']))
- job['blurb'] = ''.join(filter(lambda x: x in printable, job['blurb']))
-
-
-def post_date_from_relative_post_age(job_list):
- """function that returns the post date from the relative post age"""
- # initialize list and store regex objects of date quantifiers
- date_regex = [re.compile(r'(\d+)(?:[ +]{1,3})?(?:hour|hr)'),
- re.compile(r'(\d+)(?:[ +]{1,3})?(?:day|d)'),
- re.compile(r'(\d+)(?:[ +]{1,3})?month'),
- re.compile(r'(\d+)(?:[ +]{1,3})?year'),
- re.compile(r'[tT]oday|[jJ]ust [pP]osted'),
- re.compile(r'[yY]esterday')]
-
- for job in job_list:
- if not job['date']:
- return job['date']
-
- post_date = None
-
- # supports almost all formats like 7 hours|days and 7 hr|d|+d
- try:
- # hours old
- hours_ago = date_regex[0].findall(job['date'])[0]
- post_date = datetime.now() - timedelta(hours=int(hours_ago))
- except IndexError:
- # days old
- try:
- days_ago = \
- date_regex[1].findall(job['date'])[0]
- post_date = datetime.now() - timedelta(days=int(days_ago))
- except IndexError:
- # months old
- try:
- months_ago = \
- date_regex[2].findall(job['date'])[0]
- post_date = datetime.now() - relativedelta(
- months=int(months_ago))
- except IndexError:
- # years old
- try:
- years_ago = \
- date_regex[3].findall(job['date'])[0]
- post_date = datetime.now() - relativedelta(
- years=int(years_ago))
- except IndexError:
- # try phrases like today, just posted, or yesterday
- if date_regex[4].findall(
- job['date']) and not post_date:
- # today
- post_date = datetime.now()
- elif date_regex[5].findall(job['date']):
- # yesterday
- post_date = datetime.now() - timedelta(days=int(1))
- elif not post_date:
- # must be from the 1970's
- post_date = datetime(1970, 1, 1)
- logging.error(f"unknown date for job {job['id']}")
- # format date in standard format e.g. 2020-01-01
- job['date'] = post_date.strftime('%Y-%m-%d')
- # print('job['date']'')
-
-
-def split_url(url):
- # capture protocol, ip address and port from given url
- match = re.match(r'^(http[s]?):\/\/([A-Za-z0-9.]+):([0-9]+)?(.*)$', url)
-
- # if not all groups have a match, match will be None
- if match is not None:
- return {
- 'protocol': match.group(1),
- 'ip_address': match.group(2),
- 'port': match.group(3),
- }
- else:
- return None
-
-
-def proxy_dict_to_url(proxy_dict):
- protocol = proxy_dict['protocol']
- ip = proxy_dict['ip_address']
- port = proxy_dict['port']
-
- url_str = ''
- if protocol != '':
- url_str += protocol + '://'
- if ip != '':
- url_str += ip
- if port != '':
- url_str += ':' + port
-
- return url_str
-
-
-def change_nested_dict(data, args, val):
- """ Access nested dictionary using multiple arguments.
-
- https://stackoverflow.com/questions/10399614/accessing-value-inside-nested-dictionaries
- """
- if args and data:
- element = args[0]
- if element:
- if len(args) == 1:
- data[element] = val
- else:
- change_nested_dict(data[element], args[1:], val)
-
-
-def config_factory(base_config, attr_list):
- """ Create new config files from attribute dictionary.
-
- """
- configs = []
- for attr in attr_list:
- # create deep copy of nested dict
- config_cp = deepcopy(base_config)
-
- # change value and append
- change_nested_dict(config_cp, attr[0], attr[1])
- configs.append(config_cp)
-
- return configs
-
-
-def get_webdriver():
- """Get whatever webdriver is availiable in the system.
- webdriver_manager and selenium are currently being used for this.
- Supported browsers:[Firefox, Chrome, Opera, Microsoft Edge, Internet Explorer]
- Returns:
- a webdriver that can be used for scraping. Returns None if we don't find a supported webdriver.
-
- """
- try:
- driver = webdriver.Firefox(
- executable_path=GeckoDriverManager().install())
- except Exception:
- try:
- driver = webdriver.Chrome(ChromeDriverManager().install())
- except Exception:
- try:
- driver = webdriver.Ie(IEDriverManager().install())
- except Exception:
- try:
- driver = webdriver.Opera(
- executable_path=OperaDriverManager().install())
- except Exception:
- try:
- driver = webdriver.Edge(
- EdgeChromiumDriverManager().install())
- except Exception:
- driver = None
- logging.error(
- "Your browser is not supported. Must have one of the following installed to scrape: [Firefox, Chrome, Opera, Microsoft Edge, Internet Expolorer]")
-
- return driver
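
Among the helpers removed above, `post_date_from_relative_post_age` turned phrases such as "3 days ago" into absolute dates. The sketch below shows the same conversion in a standalone form; the regexes are simplified relative to the deleted code, and `parse_relative_age` is a name made up for this illustration:

```
import re
from datetime import datetime, timedelta
from typing import Optional

from dateutil.relativedelta import relativedelta


def parse_relative_age(text: str, now: Optional[datetime] = None) -> datetime:
    """Convert '7 hours ago', '3 days', '2 months', 'today', etc. to a datetime."""
    now = now or datetime.now()
    text = text.lower()
    patterns = [
        (re.compile(r'(\d+)[\s+]*(?:hour|hr)'), lambda n: now - timedelta(hours=n)),
        (re.compile(r'(\d+)[\s+]*(?:day|d)'), lambda n: now - timedelta(days=n)),
        (re.compile(r'(\d+)[\s+]*month'), lambda n: now - relativedelta(months=n)),
        (re.compile(r'(\d+)[\s+]*year'), lambda n: now - relativedelta(years=n)),
    ]
    for regex, shift in patterns:
        match = regex.search(text)
        if match:
            return shift(int(match.group(1)))
    if 'today' in text or 'just posted' in text:
        return now
    if 'yesterday' in text:
        return now - timedelta(days=1)
    return datetime(1970, 1, 1)  # unknown format: fall back like the removed code


print(parse_relative_age('3 days ago').strftime('%Y-%m-%d'))
```
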
diff --git a/images/svg/jobfunnel.svg b/logo/jobfunnel.svg
similarity index 100%
rename from images/svg/jobfunnel.svg
rename to logo/jobfunnel.svg
diff --git a/images/svg/jobfunnel_banner.svg b/logo/jobfunnel_banner.svg
similarity index 100%
rename from images/svg/jobfunnel_banner.svg
rename to logo/jobfunnel_banner.svg
diff --git a/readme.md b/readme.md
index 63c7f22f..baae3434 100644
--- a/readme.md
+++ b/readme.md
@@ -1,132 +1,105 @@
-
+
[![Build Status](https://travis-ci.com/PaulMcInnis/JobFunnel.svg?branch=master)](https://travis-ci.com/PaulMcInnis/JobFunnel)
[![Code Coverage](https://codecov.io/gh/PaulMcInnis/JobFunnel/branch/master/graph/badge.svg)](https://codecov.io/gh/PaulMcInnis/JobFunnel)
Automated tool for scraping job postings into a `.csv` file.
-----
-__*Note (Sept 5 2020)*__: If you are having trouble scraping jobs on current release, please try `ABCJobFunnel` branch and report any bugs you encounter! Current known issues discussion in thread here: [#90](https://github.com/PaulMcInnis/JobFunnel/pull/90)
-
-Install this branch via:
-```
-git clone git@github.com:PaulMcInnis/JobFunnel.git jobfunnelabc
-cd jobfunnelabc
-git checkout ABCJobFunnel
-cd ../
-pip install -e jobfunnelabc
-```
-----
-
### Benefits over job search sites:
* Never see the same job twice!
-* Browse all search results at once, in an easy to read/sort spreadsheet.
-* Keep track of all explicitly new job postings in your area.
-* See jobs from multiple job search sites all in one place.
-
-The spreadsheet for managing your job search:
+* No advertising.
+* See jobs from multiple job search websites all in one place.
+* Compare job search results across locations and queries.
![masterlist.csv][masterlist]
-### Dependencies
-JobFunnel requires [Python][python] 3.6 or later.
-All dependencies are listed in `setup.py`, and can be installed automatically with `pip` when installing JobFunnel.
+# Installation
-### Installing JobFunnel
+_JobFunnel requires [Python][python] 3.8 or later._
```
pip install git+https://github.com/PaulMcInnis/JobFunnel.git
-funnel --help
```
-If you want to develop JobFunnel, you may want to install it in-place:
+# Usage
+By performing regular scraping and reviewing, you can cut through the noise of even the busiest job markets.
+
+## Configure
+You can search for jobs with YAML configuration files or by passing command arguments.
+Get started by customizing our demo [settings.yaml][demo_yaml] to suit your needs (or just run it as-is):
```
-git clone git@github.com:PaulMcInnis/JobFunnel.git jobfunnel
-pip install -e ./jobfunnel
-funnel --help
+wget https://raw.githubusercontent.com/PaulMcInnis/JobFunnel/master/demo/settings.yaml -O my_settings.yaml
```
+_NOTE:_
+* _It is recommended to provide as few search keywords as possible (e.g. `Python`, `AI`)._
-### Using JobFunnel
+* _JobFunnel currently only supports `CANADA_ENGLISH` and `USA_ENGLISH` locales._
-1. Set your job search preferences in the `yaml` configuration file (or use `-kw`).
-1. Run `funnel` to scrape all-available job listings.
-1. Review jobs in the master-list, update the job `status` to other values such as `interview` or `offer`.
-1. Set any undesired job `status` to `archive`, these jobs will be removed from the `.csv` next time you run `funnel`.
-1. Check out [demo/readme.md][demo] if you want to try the demo.
+## Scrape
-__*Note*__: `rejected` jobs will be filtered out and will disappear from the output `.csv`.
+Run `funnel` to populate your master CSV file with jobs from available providers:
-### Usage Notes
-
-* **Custom Status**
- Note that any custom states (i.e `applied`) are preserved in the spreadsheet.
+```
+funnel load -s my_settings.yaml
+```
-* **Running Filters**
- To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file.
+## Review
-* **Recovering Lost Master-list**
- If ever your master-list gets deleted you still have the historic pickle files.
- Simply run `funnel --recover` to generate a new master-list.
+Open the master CSV file and update the per-job `status`:
-* **Managing Multiple Searches**
- You can keep multiple search results across multiple `.csv` files:
- ```
- funnel -kw Python -o python_search
- funnel -kw AI Machine Learning -o ML_search
- ```
+* Set to `interested`, `applied`, `interview` or `offer` to reflect your progression on the job.
-* **Filtering Undesired Companies**
-Filter undesired companies by providing your own `yaml` configuration and adding them to the black list(see `JobFunnel/jobfunnel/config/settings.yaml`).
-
-* **Filtering Old Jobs**
- Filter jobs that you think are too old:
- `funnel -s JobFunnel/demo/settings.yaml --max_listing_days 30` will filter out job listings that are older than 30 days.
+* Set to `archive`, `rejected` or `delete` to remove a job from this search. You can review 'blocked' jobs within your `block_list_file`.
+# Advanced Usage
* **Automating Searches**
   JobFunnel can be easily automated to run nightly with [crontab][cron].
   For more information see the [crontab document][cron_doc]; a sample cron entry is sketched below.
-
- * **Glassdoor Notes**
- The `GlassDoor` scraper has two versions: `GlassDoorStatic` and `GlassDoorDynamic`. Both of these give you the same end result: they scrape GlassDoor and dump your job listings onto your `master_list.csv`. We recommend to *always* run `GlassDoorStatic` (this is the default preset we have on our demo `settings.yaml` file) because it is *a lot* faster than `GlassDoorDynamic`. However, given the event that `GlassDoorStatic` fails, you may use `GlassDoorDynamic`. It is very slow, but you'll still be able to scrape GlassDoor.
-
- When using `GlassDoorDynamic` Glassdoor **might** require a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.
-
- You may also of course disable the Glassdoor scraper when using `GlassDoorDynamic` in your `settings.yaml` to not have to complete any CAPTCHA at all:
-```
- - 'Indeed'
- - 'Monster'
- # - 'GlassDoorStatic'
- # - 'GlassDoorDynamic'
-```
+
+* **Writing your own Scrapers**
+ If you have a job website you'd like to write a scraper for, you are welcome to implement it; review the [Base Scraper][basescraper] for implementation details.
+
+* **Adding Support for X Language / Job Website**
+ JobFunnel supports scraping jobs from the same job website across locales & domains. If you are interested in adding support, you may only need to define session headers and domain strings; review the [Base Scraper][basescraper] for further implementation details.
+
+* **Blocking Companies**
+ Filter undesired companies by adding them to the `company_block_list` in your YAML, or by passing them on the command line with `-cbl`.
+
+* **Job Age Filter**
+ You can set the maximum age of scraped listings (in days) with `max_listing_days`.
* **Reviewing Jobs in Terminal**
You can review the job list in the command line:
```
column -s, -t < master_list.csv | less -#2 -N -S
```
-* **Saving Duplicates**
- You can save removed duplicates in a separate file, which is stored in the same place as your master list:
+
+* **Respectful Delaying**
+ Respectfully scrape your job posts with our built-in delaying algorithms.
+
+ To better understand how to configure delaying, check out [this Jupyter Notebook][delay_jp], which breaks down the algorithm step by step with code and visualizations.
+
+* **Recovering Lost Data**
+ JobFunnel can rebuild your master CSV from your `cache_folder`, where all the historic scrape data is located:
```
- funnel --save_dup
+ funnel --recover
```
-* **Respectful Delaying**
- Respectfully scrape your job posts with our built-in delaying algorithm, which can be configured using a config file (see `JobFunnel/jobfunnel/config/settings.yaml`) or with command line arguments:
- - `-d` lets you set your max delay value: `funnel -s demo/settings.yaml -kw AI -d 15`
- - `-r` lets you specify if you want to use random delaying, and uses `-d` to control the range of randoms we pull from: `funnel -s demo/settings.yaml -kw AI -r`
- - `-c` specifies converging random delay, which is an alternative mode of random delay. Random delay needed to be turned on as well for it to work. Proper usage would look something like this: `funnel -s demo/settings.yaml -kw AI -r -c`
- - `-md` lets you set a minimum delay value: `funnel -s demo/settings.yaml -d 15 -md 5`
- - `--fun` can be used to set which mathematical function (`constant`, `linear`, or `sigmoid`) is used to calculate delay: `funnel -s demo/settings.yaml --fun sigmoid`
- - `--no_delay` Turns off delaying, but it's usage is not recommended.
- To better understand how to configure delaying, check out [this Jupyter Notebook][delay_jp] breaking down the algorithm step by step with code and visualizations.
+* **Running by CLI**
+ You can run JobFunnel using the CLI only; review the command structure via:
+ ```
+ funnel inline -h
+ ```
-[masterlist]:demo/assests/demo.png "masterlist.csv"
+[requirements]:requirements.txt
+[masterlist]:demo/demo.png "masterlist.csv"
+[demo_yaml]:demo/settings.yaml
[python]:https://www.python.org/
-[demo]:demo/readme.md
+[basescraper]:jobfunnel/backend/scrapers/base.py
[cron]:https://en.wikipedia.org/wiki/Cron
[cron_doc]:docs/crontab/readme.md
[conc_fut]:https://docs.python.org/dev/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
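
Expanding on the readme's "Automating Searches" note above, a nightly crontab entry might look like the following. The schedule, paths, and settings filename are placeholders for illustration only; the crontab document under `docs/crontab` remains the maintained reference:

```
# Illustrative crontab entry (placeholder paths): run JobFunnel every night at
# 02:00 and append output to a log file. Adjust paths to your own environment.
0 2 * * * cd /path/to/job_search && funnel load -s my_settings.yaml >> funnel_cron.log 2>&1
```
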
diff --git a/requirements.txt b/requirements.txt
index 94a51871..f93c4ee9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,6 @@ scipy>=1.4.1
pytest>=5.3.1
pytest-mock>=3.1.1
selenium>=3.141.0
-webdriver-manager>=2.4.
\ No newline at end of file
+webdriver-manager>=2.4.0
+Cerberus>=1.3.2
+tqdm>=4.47.0
diff --git a/setup.py b/setup.py
index 2c18edbe..91f0fbbe 100644
--- a/setup.py
+++ b/setup.py
@@ -1,22 +1,28 @@
+"""Install JobFunnel as a package
+"""
from setuptools import setup, find_packages
from jobfunnel import __version__ as version
+
description = 'Automated tool for scraping job postings.'
url = 'https://github.com/PaulMcInnis/JobFunnel'
-requires = ['beautifulsoup4>=4.6.3',
- 'lxml>=4.2.4',
- 'requests>=2.19.1',
- 'python-dateutil>=2.8.0',
- 'PyYAML>=5.1',
- 'scikit-learn>=0.21.2',
- 'nltk>=3.4.1',
- 'scipy>=1.4.1',
- 'pytest>=5.3.1',
- 'pytest-mock>=3.1.1',
- 'selenium>=3.141.0',
- 'webdriver-manager>=2.4.0'
- ]
+requires = [
+ 'beautifulsoup4>=4.6.3',
+ 'lxml>=4.2.4',
+ 'requests>=2.19.1',
+ 'python-dateutil>=2.8.0',
+ 'PyYAML>=5.1',
+ 'scikit-learn>=0.21.2',
+ 'nltk>=3.4.1',
+ 'scipy>=1.4.1',
+ 'pytest>=5.3.1',
+ 'pytest-mock>=3.1.1',
+ 'selenium>=3.141.0',
+ 'webdriver-manager>=2.4.0',
+ 'Cerberus>=1.3.2',
+ 'tqdm>=4.47.0',
+]
with open('readme.md', 'r') as f:
readme = f.read()
@@ -27,12 +33,14 @@
description=description,
long_description=readme,
long_description_content_type='text/markdown',
- author='Paul McInnis, Bradley Kohler, Jose Alarcon, Erich Mengore, Mark van der Broek',
+ author='Paul McInnis, Bradley Kohler, Jose Alarcon, Erich Mengore, '
+ 'Mark van der Broek',
author_email='paulmcinnis99@gmail.com',
url=url,
license='MIT License',
- python_requires='>=3.6.0',
+ python_requires='>=3.8.0',
install_requires=requires,
- packages=find_packages(exclude=('demo', 'tests')),
+ packages=find_packages(exclude=('demo', 'tests', 'docs', 'images')),
include_package_data=True,
- entry_points={'console_scripts': ['funnel = jobfunnel.__main__:main']})
+ entry_points={'console_scripts': ['funnel = jobfunnel.__main__:main']}
+)
diff --git a/tests/test_glassdoor.py b/tests/backend/__init__.py
similarity index 100%
rename from tests/test_glassdoor.py
rename to tests/backend/__init__.py
diff --git a/tests/backend/scrapers/__init__.py b/tests/backend/scrapers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/backend/scrapers/test_glassdoor.py b/tests/backend/scrapers/test_glassdoor.py
new file mode 100644
index 00000000..d3b7fbfb
--- /dev/null
+++ b/tests/backend/scrapers/test_glassdoor.py
@@ -0,0 +1 @@
+# FIXME
\ No newline at end of file
diff --git a/tests/backend/scrapers/test_indeed.py b/tests/backend/scrapers/test_indeed.py
new file mode 100644
index 00000000..d3b7fbfb
--- /dev/null
+++ b/tests/backend/scrapers/test_indeed.py
@@ -0,0 +1 @@
+# FIXME
\ No newline at end of file
diff --git a/tests/backend/scrapers/test_monster.py b/tests/backend/scrapers/test_monster.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/backend/tools/__init__.py b/tests/backend/tools/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/backend/tools/test_delay.py b/tests/backend/tools/test_delay.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/backend/tools/test_filters.py b/tests/backend/tools/test_filters.py
new file mode 100644
index 00000000..d3b7fbfb
--- /dev/null
+++ b/tests/backend/tools/test_filters.py
@@ -0,0 +1 @@
+# FIXME
\ No newline at end of file
diff --git a/tests/backend/tools/test_tools.py b/tests/backend/tools/test_tools.py
new file mode 100644
index 00000000..623b65e9
--- /dev/null
+++ b/tests/backend/tools/test_tools.py
@@ -0,0 +1,2 @@
+# FIXME
+
diff --git a/tests/config/__init__.py b/tests/config/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/config/test_cli.py b/tests/config/test_cli.py
new file mode 100644
index 00000000..c0cb9e4d
--- /dev/null
+++ b/tests/config/test_cli.py
@@ -0,0 +1,86 @@
+"""Test CLI parsing --> config dict
+"""
+import os
+import pytest
+from jobfunnel.config import parse_cli, build_config_dict
+
+
+TEST_YAML = os.path.join('tests', 'data', 'test_config.yml')
+
+
+@pytest.mark.parametrize('argv, exp_exception', [
+ # Test schema from YAML
+ (['load', '-s', TEST_YAML], None),
+ # Test overrideable args
+ (['load', '-s', TEST_YAML, '-log-level', 'DEBUG'], None),
+ (['load', '-s', TEST_YAML, '-log-level', 'WARNING'], None),
+ (['load', '-s', TEST_YAML, '--no-scrape'], None),
+ # Test schema from CLI
+ (['inline', '-csv', 'TEST_search', '-log-level', 'DEBUG', '-cache',
+ 'TEST_cache', '-blf', 'TEST_block_list', '-dl', 'TEST_duplicates_list',
+ '-log-file', 'TEST_log_file', '-kw', 'I', 'Am', 'Testing', '-l',
+ 'CANADA_ENGLISH', '-ps', 'TESTPS', '-c', 'TestCity', '-cbl',
+ 'Blocked Company', 'Blocked Company 2', '-p', 'INDEED', 'MONSTER',
+ '-r', '42', '-max-listing-days', '44', '--similar-results', '--random',
+ '--converging', '-max', '8', '-min', '2', '-algorithm', 'LINEAR'], None),
+ # Invalid cases
+ (['load'], SystemExit),
+ (['load', '-csv', 'boo'], SystemExit),
+ (['inline', '-csv', 'TEST_search', '-log-level', 'DEBUG', '-cache',
+ 'TEST_cache', '-blf', 'TEST_block_list', '-dl',
+ 'TEST_duplicates_list'], SystemExit),
+ (['-csv', 'test.csv'], SystemExit),
+ (['-l',
+ 'CANADA_ENGLISH', '-ps', 'TESTPS', '-c', 'TestCity', '-cbl',
+ 'Blocked Company', 'Blocked Company 2', '-p', 'INDEED', 'MONSTER',
+ '-r', '42', '-max-listing-days', '44', '--similar-results', '--random',
+ '--converging', '-max', '8', '-min', '2', '-algorithm',
+ 'LINEAR'], SystemExit),
+])
+def test_parse_cli_build_config_dict(argv, exp_exception):
+ """Functional test to ensure that the CLI functions as we expect
+ TODO: break down into test_parse_cli and test_config_parser
+ FIXME: add exception message assertions
+ """
+ # FUT
+ if exp_exception:
+ with pytest.raises(exp_exception):
+ args = parse_cli(argv)
+ cfg = build_config_dict(args)
+ else:
+ args = parse_cli(argv)
+ cfg = build_config_dict(args)
+
+ # Assertions
+ assert cfg['master_csv_file'] == 'TEST_search'
+ assert cfg['cache_folder'] == 'TEST_cache'
+ assert cfg['block_list_file'] == 'TEST_block_list'
+ assert cfg['duplicates_list_file'] == 'TEST_duplicates_list'
+ assert cfg['search']['locale'] == 'CANADA_ENGLISH'
+ assert cfg['search']['providers'] == ['INDEED', 'MONSTER']
+ assert cfg['search']['province_or_state'] == 'TESTPS'
+ assert cfg['search']['city'] == 'TestCity'
+ assert cfg['search']['radius'] == 42
+ assert cfg['search']['keywords'] == ['I', 'Am', 'Testing']
+ assert cfg['search']['max_listing_days'] == 44
+ assert cfg['search']['company_block_list'] == ['Blocked Company',
+ 'Blocked Company 2']
+ if '-log-level' in argv:
+        # NOTE: the log level must always be passed at the same argv position for this check to work
+ assert cfg['log_level'] == argv[4]
+ else:
+ assert cfg['log_level'] == 'INFO'
+ if '--no-scrape' in argv:
+ assert cfg['no_scrape']
+ else:
+ assert not cfg['no_scrape']
+ if '--similar-results' in argv:
+ assert cfg['search']['similar_results']
+ else:
+ assert not cfg['search']['similar_results']
+
+ assert cfg['delay']['algorithm'] == 'LINEAR'
+ assert cfg['delay']['max_duration'] == 8
+ assert cfg['delay']['min_duration'] == 2
+ assert cfg['delay']['random']
+ assert cfg['delay']['converging']
diff --git a/tests/config/test_delay.py b/tests/config/test_delay.py
new file mode 100644
index 00000000..011d26a9
--- /dev/null
+++ b/tests/config/test_delay.py
@@ -0,0 +1,39 @@
+"""Test the DelayConfig
+"""
+import pytest
+
+from jobfunnel.config import DelayConfig
+from jobfunnel.resources import DelayAlgorithm
+
+
+@pytest.mark.parametrize("max_duration, min_duration, invalid_dur", [
+ (1.0, 1.0, True),
+ (-1.0, 1.0, True),
+ (5.0, 0.0, True),
+ (5.0, 1.0, False),
+])
+@pytest.mark.parametrize("random, converge, invalid_rand", [
+ (True, True, False),
+ (True, False, False),
+ (False, True, True),
+])
+@pytest.mark.parametrize("delay_algorithm", (DelayAlgorithm.LINEAR, None))
+def test_delay_config_validate(max_duration, min_duration, invalid_dur,
+ delay_algorithm, random, converge, invalid_rand):
+ """Test DelayConfig
+ TODO: test messages too
+ """
+ cfg = DelayConfig(
+ max_duration=max_duration,
+ min_duration=min_duration,
+ algorithm=delay_algorithm,
+ random=random,
+ converge=converge,
+ )
+
+ # FUT
+ if invalid_dur or not delay_algorithm or invalid_rand:
+ with pytest.raises(ValueError):
+ cfg.validate()
+ else:
+ cfg.validate()
diff --git a/tests/config/test_manager.py b/tests/config/test_manager.py
new file mode 100644
index 00000000..a1ac20e3
--- /dev/null
+++ b/tests/config/test_manager.py
@@ -0,0 +1,30 @@
+# FIXME: need to break down config manager testing a bit more
+# @pytest.mark.parametrize('pass_del_cfg', (True, False))
+# def test_config_manager_init(mocker, pass_del_cfg):
+# """NOTE: unlike other configs this one validates itself on creation
+# """
+# # Mocks
+# patch_del_cfg = mocker.patch('jobfunnel.config.manager.DelayConfig')
+# patch_os = mocker.patch('jobfunnel.config.manager.os')
+# patch_os.path.exists.return_value = False # check it makes all paths
+# mock_master_csv = mocker.Mock()
+# mock_block_list = mocker.Mock()
+# mock_dupe_list = mocker.Mock()
+# mock_cache_folder = mocker.Mock()
+# mock_search_cfg = mocker.Mock()
+# mock_proxy_cfg = mocker.Mock()
+# mock_del_cfg = mocker.Mock()
+
+# # FUT
+# cfg = JobFunnelConfigManager(
+# master_csv_file=mock_master_csv,
+# user_block_list_file=mock_block_list,
+# duplicates_list_file=mock_dupe_list,
+# cache_folder=mock_cache_folder,
+# search_config=mock_search_cfg,
+# delay_config=mock_del_cfg if pass_del_cfg else None,
+# proxy_config=mock_proxy_cfg,
+# log_file='', # TODO optional?
+# )
+
+# # Assertions
diff --git a/tests/config/test_proxy.py b/tests/config/test_proxy.py
new file mode 100644
index 00000000..20261d62
--- /dev/null
+++ b/tests/config/test_proxy.py
@@ -0,0 +1,3 @@
+# FIXME
+# def test_proxy_config(protocol, ip_address, port):
+# pass
diff --git a/tests/config/test_search.py b/tests/config/test_search.py
new file mode 100644
index 00000000..c0787e3a
--- /dev/null
+++ b/tests/config/test_search.py
@@ -0,0 +1,60 @@
+"""Test the search config
+"""
+import pytest
+
+from jobfunnel.config import SearchConfig
+from jobfunnel.resources import Locale
+
+
+@pytest.mark.parametrize("keywords, exp_query_str", [
+ (['b33f', 'd3ad'], 'b33f d3ad'),
+ (['trumpet'], 'trumpet'),
+])
+def test_search_config_query_string(mocker, keywords, exp_query_str):
+ """Test that search config can build keyword query string correctly.
+ """
+ cfg = SearchConfig(
+ keywords=keywords,
+ province_or_state=mocker.Mock(),
+ locale=Locale.CANADA_FRENCH,
+ providers=mocker.Mock(),
+ )
+
+ # FUT
+ query_str = cfg.query_string
+
+ # Assertions
+ assert query_str == exp_query_str
+
+
+@pytest.mark.parametrize("locale, domain, exp_domain", [
+ (Locale.CANADA_ENGLISH, None, 'ca'),
+ (Locale.CANADA_FRENCH, None, 'ca'),
+ (Locale.USA_ENGLISH, None, 'com'),
+ (Locale.USA_ENGLISH, 'xyz', 'xyz'),
+ (None, None, None),
+])
+def test_search_config_init(mocker, locale, domain, exp_domain):
+ """Make sure the init functions as we expect wrt to domain selection
+ """
+ # FUT
+ if not locale:
+ # test our error
+ with pytest.raises(ValueError, match=r"Unknown domain for locale.*"):
+ cfg = SearchConfig(
+ keywords=mocker.Mock(),
+ province_or_state=mocker.Mock(),
+ locale=-1, # AKA an unknown Enum entry to Locale
+ providers=mocker.Mock(),
+ )
+ else:
+ cfg = SearchConfig(
+ keywords=mocker.Mock(),
+ province_or_state=mocker.Mock(),
+ locale=locale,
+ domain=domain,
+ providers=mocker.Mock(),
+ )
+
+ # Assertions
+ assert cfg.domain == exp_domain
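
These parametrized cases encode the expected locale-to-domain mapping (`ca` for Canadian locales, `com` for `USA_ENGLISH`, with an explicit `domain` overriding both). A minimal sketch of selection logic that would satisfy them follows; it is illustrative only and not the actual `SearchConfig` implementation, and both the `resolve_domain` helper and the stand-in `Locale` enum are invented for this example:

```
from enum import Enum, auto
from typing import Optional


class Locale(Enum):  # stand-in for jobfunnel.resources.Locale, for illustration
    CANADA_ENGLISH = auto()
    CANADA_FRENCH = auto()
    USA_ENGLISH = auto()


_DEFAULT_DOMAINS = {
    Locale.CANADA_ENGLISH: 'ca',
    Locale.CANADA_FRENCH: 'ca',
    Locale.USA_ENGLISH: 'com',
}


def resolve_domain(locale: Locale, domain: Optional[str] = None) -> str:
    """Prefer an explicit domain override, otherwise derive it from the locale."""
    if domain:
        return domain
    if locale not in _DEFAULT_DOMAINS:
        raise ValueError(f"Unknown domain for locale {locale}")
    return _DEFAULT_DOMAINS[locale]


assert resolve_domain(Locale.USA_ENGLISH) == 'com'
assert resolve_domain(Locale.USA_ENGLISH, 'xyz') == 'xyz'
assert resolve_domain(Locale.CANADA_FRENCH) == 'ca'
```
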
diff --git a/tests/conftest.py b/tests/conftest.py
index f35b71b2..d3b7fbfb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,115 +1 @@
-import pytest
-import sys
-
-from unittest.mock import patch
-
-from jobfunnel.config.parser import parse_config
-from jobfunnel.tools.tools import config_factory
-from jobfunnel.__main__ import PROVIDERS
-from jobfunnel.jobfunnel import MASTERLIST_HEADER
-
-""" search_term_configs is a collection of search_terms configurations for all supported countries. If more countries are added to JobFunnel, one may add those new configurations to this variable and those new countries/domains will be tested without having to write new tests for them, assuming of course that one uses @pytest.mark.parametrize to feed search_term_configs to those new tests."""
-search_term_configs = [{'region': {'province': 'ON', 'city': 'waterloo', 'domain': 'ca', 'radius': 25}}, {
- 'region': {'province': '', 'city': 'new york', 'domain': 'com', 'radius': 25}}]
-
-
-@pytest.fixture()
-def configure_options():
- def setup(options: list):
- """Assigns the options to argv(as if JobFunnel were called from the command line with those options)
- and calls parse_config(). This fixture assumes that the test_parse module has been tested and passes.
- """
- with patch.object(sys, 'argv', options):
- config = parse_config()
- return config
-
- return setup
-
-
-@pytest.fixture()
-def job_listings():
- def setup(attr_list: list):
- """
- This function generates job listings.
- If attr_list is empty, then it returns a single job with
- the contents of job_format, which is a default job listing defined on this fixture.
- If attr_list is not empty, it returns a job listing for each attribute pair on attr_list.
- The expected format for each item on attr_list is
- [['key1', 'key2', 'keyN'], 'value']
- """
- job_format = {'status': 'new', 'title': 'Python Engineer', 'company': 'Python Corp', 'location': 'Waterloo, ON', 'date': '10 days ago', 'blurb': '', 'tags': '',
- 'link':
- 'https://job-openings.best-job-board.domain/python-engineer-waterloo-on-ca-pro'
- 'com/216808420', 'id': '216808420', 'provider': 'monster', 'query': 'Python'}
- if len(attr_list) > 0:
- return config_factory(job_format, attr_list)
- else:
- return job_format
- return setup
-
-
-@pytest.fixture()
-def per_id_job_listings(job_listings):
- def setup(attr_list: list, first_job_id: int = 0):
- """
- This function generates job_listings in the {'job_id':{job_listing}}
- fashion. This is particularly useful for functions like tfidf_filter that expect job listings in this format.
- Args:
- attr_list: an attribute list in the [['key1', 'key2', 'keyN'], 'value'] format.
- first_job_id: At what number to start generating job ids. This is particular useful when you want different job ids but the len of attr_list is the same across multiple calls to this function.
- Returns:
- A dictionary of the format {'job_id#1':{job_listing},'job_id#2':{job_listing},
- 'job_id#3':{job_listing}}. Please note that every job_id is unique.
- """
- job_list = job_listings(attr_list)
- new_job_id = first_job_id
- per_id_job_list = {}
- for job in job_list:
- job['id'] = str(new_job_id)
- per_id_job_list.update({job['id']: job})
- new_job_id += 1
- return per_id_job_list
- return setup
-
-
-@pytest.fixture()
-def init_scraper(configure_options):
- def setup(provider: str, options: list = ['']):
- """
- This function initializes a scraper(such as Indeed, Monster, etc) specified by provider.
- Hopefully it'll reduce some code duplication in tests.
- Args:
-            provider: the provider to be initialized.
- Note that provider must match one of the keys defined for each scraper on the PROVIDERS dict on __main__.
- options: the options to be passed to the scraper, such as keywords, domain, etc.
- Note that only command-line options are accepted. Anything that needs to be tweaked that is not a command line option needs to be configured by the caller manually.
- Returns:
- An instance of the specified provider.
- """
- return PROVIDERS[provider](configure_options(options))
- return setup
-
-
-@pytest.fixture()
-def setup_scraper(init_scraper):
- def setup(scraper: str):
- """
- This fixture initializes the scraper state up until the point of
- having a BeautifulSoup list that can be used for scraping.
- This will help us avoid code duplication for tests.
- Args:
- scraper: The name of the scraper. Note that this name is used as a key for the PROVIDERS dict defined on __main__.py
- Returns:
- A dict of the form {'job_provider':provider,'job_list':job_soup_list, 'job_keys':job}.
- job_provider is the Indeed scraper object.
- job_soup_list is the list of BeautifulSoup objects that is ready to be scraped.
- job is a dict with all the keys from MASTERLIST_HEADER and empty values.
- """
- provider = init_scraper(scraper)
- # get the search url
- search = provider.get_search_url()
- job_soup_list = []
- provider.search_page_for_job_soups(search, 0, job_soup_list)
- job = dict([(k, '') for k in MASTERLIST_HEADER])
- return {'job_provider': provider, 'job_list': job_soup_list, 'job_keys': job}
- return setup
+# FIXME
\ No newline at end of file
diff --git a/tests/json/cities_america.json b/tests/data/cities_america.json
similarity index 100%
rename from tests/json/cities_america.json
rename to tests/data/cities_america.json
diff --git a/tests/json/cities_canada.json b/tests/data/cities_canada.json
similarity index 100%
rename from tests/json/cities_canada.json
rename to tests/data/cities_canada.json
diff --git a/tests/data/test_config.yml b/tests/data/test_config.yml
new file mode 100644
index 00000000..b47804a5
--- /dev/null
+++ b/tests/data/test_config.yml
@@ -0,0 +1,28 @@
+master_csv_file: TEST_search
+cache_folder: TEST_cache
+block_list_file: TEST_block_list
+duplicates_list_file: TEST_duplicates_list
+log_file: TEST_log_file
+search:
+ locale: CANADA_ENGLISH
+ providers:
+ - INDEED
+ - MONSTER
+ province_or_state: TESTPS
+ city: TestCity
+ radius: 42
+ keywords:
+ - I
+ - Am
+ - Testing
+ max_listing_days: 44
+ company_block_list:
+ - "Blocked Company"
+ - "Blocked Company 2"
+log_level: INFO
+delay:
+ algorithm: LINEAR
+ max_duration: 8
+ min_duration: 2
+ random: True
+ converging: True
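
This test YAML mirrors the nested config dict asserted in `tests/config/test_cli.py` above. As a quick sanity check, the file can be inspected with PyYAML (already a project dependency); the snippet below is illustrative only:

```
import yaml

with open('tests/data/test_config.yml') as settings_file:
    cfg = yaml.safe_load(settings_file)

print(cfg['search']['providers'])           # ['INDEED', 'MONSTER']
print(cfg['search']['company_block_list'])  # ['Blocked Company', 'Blocked Company 2']
print(cfg['delay']['algorithm'])            # LINEAR
```
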
diff --git a/tests/test_countries.py b/tests/test_countries.py
deleted file mode 100644
index ab40e2ee..00000000
--- a/tests/test_countries.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import pytest
-import os
-import re
-import sys
-import json
-import random
-
-from bs4 import BeautifulSoup
-from requests import get, post
-from typing import Union
-from unittest.mock import patch
-
-from jobfunnel.config.parser import parse_config
-from jobfunnel.indeed import Indeed
-from jobfunnel.monster import Monster
-from jobfunnel.glassdoor_static import GlassDoorStatic
-
-
-PROVIDERS = {'indeed': Indeed, 'monster': Monster,
- 'glassdoorstatic': GlassDoorStatic}
-
-# TODO: Test GlassdoorDynamic Provider
-
-DOMAINS = {'America': 'com', 'Canada': 'ca'}
-
-cities_america = os.path.normpath(
- os.path.join(os.path.dirname(__file__), 'json/cities_america.json'))
-cities_canada = os.path.normpath(
- os.path.join(os.path.dirname(__file__), 'json/cities_canada.json'))
-
-with open(cities_america, 'r') as file:
- cities_america = json.load(file)
-
-with open(cities_canada, 'r') as file:
- cities_canada = json.load(file)
-
-cities = cities_america + cities_canada
-test_size = 100
-if len(cities) < test_size:
- test_size = len(cities)
-
-# take a random sample of cities of size test_size
-cities = random.sample(cities, test_size)
-
-with patch.object(sys, 'argv', ['']):
- config = parse_config()
-
-
-@pytest.mark.xfail(strict=False)
-@pytest.mark.parametrize('city', cities)
-def test_cities(city, delay=1):
- """tests american city"""
- count = 0 # a count of providers with successful test cases
- for p in config['providers']:
- provider: Union[GlassDoorStatic, Monster,
- Indeed] = PROVIDERS[p](config)
- provider.search_terms['region']['domain'] = DOMAINS[city['country']]
- provider.search_terms['region']['province'] = city['abbreviation']
- provider.search_terms['region']['city'] = city['city']
- if isinstance(provider, Indeed):
- # get search url
- search = provider.get_search_url()
-
- # get the html data, initialize bs4 with lxml
- request_html = get(search, headers=provider.headers)
- elif isinstance(provider, Monster):
- # get search url
- search = provider.get_search_url()
-
- # get the html data, initialize bs4 with lxml
- request_html = get(search, headers=provider.headers)
- elif isinstance(provider, GlassDoorStatic):
- try:
- # get search url
- search, data = provider.get_search_url(method='post')
- except IndexError:
- # sometimes glassdoor does not find the location id
- continue
-
- # get the html data, initialize bs4 with lxml
- request_html = post(search, headers=provider.headers, data=data)
- else:
- raise TypeError(
- f'Type {type(provider)} does not match any of the providers.')
-
- # create the soup base
- soup_base = BeautifulSoup(request_html.text, provider.bs4_parser)
-
- # parse the location text field
- where = None # initialize location variable
- location = ', '.join([city['city'], city['abbreviation']])
- location = re.sub("['-]", '', location)
- if isinstance(provider, Indeed):
- where = soup_base.find(id='where')['value'].strip()
- elif isinstance(provider, Monster):
- where = soup_base.find(id='location')['value'].strip()
- elif isinstance(provider, GlassDoorStatic):
- where = soup_base.find(id='sc.location')['value']
-
- if where.lower() == location.lower():
- count += 1
-
- # assert that at least one provider found the correct location
- assert count > 0
diff --git a/tests/test_delay.py b/tests/test_delay.py
deleted file mode 100644
index 8b6bd53e..00000000
--- a/tests/test_delay.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import pytest
-from jobfunnel.tools.tools import config_factory
-from jobfunnel.tools.delay import delay_alg
-
-# Define mock data for this test module
-
-linear_delay = [0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8]
-sigmoid_delay = [0, 0.263, 0.284, 0.307,
- 0.332, 0.358, 0.386, 0.417, 0.449, 0.485]
-constant_delay = [0, 8.6, 8.8, 9.0, 9.2, 9.4, 9.6, 9.8, 10.0, 10.0]
-random_delay = [0, 5, 5, 5, 5, 5, 5, 5, 5, 5]
-job_list = ['job1', 'job2', 'job3', 'job4', 'job5',
- 'job6', 'job7', 'job8', 'job9', 'job10']
-
-
-# mock random.uniform to get constant values
-
-
-def mock_rand_uniform(a, b):
- return 5
-
-
-@pytest.mark.parametrize('function, expected_result', [('linear', linear_delay), ('sigmoid', sigmoid_delay), ('constant', constant_delay)])
-class TestClass:
-
- # test linear, constant and sigmoid delay
- # This test considers configurations with random and converge fields
- @pytest.mark.parametrize('random,converge', [(True, True), (True, False), (False, False)])
- def test_delay_alg(self, configure_options, function, expected_result, random, converge, monkeypatch):
- config = configure_options([''])
- config['delay_config']['random'] = random
- config['delay_config']['function'] = function
- config['delay_config']['converge'] = converge
- if random:
- monkeypatch.setattr(
- 'jobfunnel.tools.delay.uniform', mock_rand_uniform)
- expected_result = random_delay
- else:
- config['delay_config']['min_delay'] = 0
- delay_result = delay_alg(10, config['delay_config'])
- assert delay_result == expected_result
-
- # test linear, constant and sigmoid delay with a negative min_delay
-
- def test_delay_alg_negative_min_delay(self, configure_options, function, expected_result):
- config = configure_options([''])
- config['delay_config']['random'] = False
- config['delay_config']['function'] = function
- config['delay_config']['min_delay'] = -2
- delay_result = delay_alg(10, config['delay_config'])
- assert delay_result == expected_result
-
- # test linear, constant and sigmoid delay when min_delay is greater than the delay
-
- def test_delay_alg_min_delay_greater_than_delay(self, configure_options, function, expected_result):
- config = configure_options([''])
- config['delay_config']['random'] = False
- config['delay_config']['function'] = function
- # Set the delay value to its default
- config['delay_config']['delay'] = 10
- config['delay_config']['min_delay'] = 15
- delay_result = delay_alg(10, config['delay_config'])
- assert delay_result == expected_result
-
- # test linear, constant and sigmoid delay with negative delay
-
- def test_delay_alg_negative_delay(self, configure_options, function, expected_result):
- config = configure_options([''])
- config['delay_config']['random'] = False
- config['delay_config']['function'] = function
- config['delay_config']['min_delay'] = 0
- config['delay_config']['delay'] = -2
- with pytest.raises(ValueError) as e:
- delay_result = delay_alg(10, config['delay_config'])
- assert str(
- e.value) == "\nYour delay is set to 0 or less.\nCancelling execution..."
-
- # test linear, constant and sigmoid delay with random and a list as input
-
- def test_delay_alg_list_linear(self, configure_options, function, expected_result):
- config = configure_options([''])
- config['delay_config']['random'] = False
- config['delay_config']['function'] = function
- config['delay_config']['min_delay'] = 0
- delay_result = delay_alg(job_list, config['delay_config'])
- assert delay_result == expected_result
diff --git a/tests/test_filters.py b/tests/test_filters.py
deleted file mode 100644
index cb64d7cd..00000000
--- a/tests/test_filters.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import pytest
-
-from collections import OrderedDict
-from datetime import datetime, timedelta
-from unittest.mock import patch
-
-from jobfunnel.tools.filters import tfidf_filter, id_filter, date_filter
-
-
-attr_list = [[['blurb'], 'Looking for a passionate team player that is willing to learn new technologies. Our company X is still growing at an exponential rate. In order to be a perfect fit'
- ' you must tell us your favorite movie at the interview; favorite food; and a fun fact about yourself. The ideal fit will also know Python and SDLC.'],
- [['blurb'], 'Looking for a passionate developer that is willing to learn new technologies. Our company X is still growing at an exponential rate. In order to be a perfect fit'
- ' you must tell us your favorite movie at the interview; favorite food; and your favorite programming langauge. The ideal candiadate will also know Python and SDLC.'],
- [['blurb'], 'We make the best ice cream in the world. Our company still young and growing. We have stable funding and a lot of crazy ideas to make our company grow. The ideal candidate should like ice cream.'],
- [['blurb'], 'We make the best ice cream in the world. Our company still young and growing. We have stable funding and a lot of crazy ideas to make our company grow. The ideal candidate should love ice cream and all things ice cream.'],
- ]
-
-
-def test_date_filter(per_id_job_listings):
- new_job_listings = per_id_job_listings([attr_list[0], attr_list[1]])
- # assign two different dates to the job_postings
- job_date = datetime.now() - timedelta(days=10)
- new_job_listings['0']['date'] = job_date.strftime('%Y-%m-%d')
- job_date = datetime.now() - timedelta(days=3)
- new_job_listings['1']['date'] = job_date.strftime('%Y-%m-%d')
- date_filter(new_job_listings, 5)
- # assert that that jobs older than 5 days have been removed
- assert list(new_job_listings) == ['1']
-
-
-def test_id_filter(per_id_job_listings):
- new_job_listings = per_id_job_listings([attr_list[0], attr_list[2]])
- # generate job listings with the same ids as new_job_listings
- previous_job_listings = per_id_job_listings([attr_list[1], attr_list[3]])
- id_filter(new_job_listings, previous_job_listings,
- new_job_listings['0']['provider'])
- # assert that the new job listings have been removed since they already exist
- assert len(new_job_listings) == 0
- # assert that the correct job ids are in the new filtered new_job_listings
- assert list(previous_job_listings) == ['0', '1']
-
-
-def test_tfidf_filter_no_previous_scrape(per_id_job_listings):
- new_job_listings = per_id_job_listings(attr_list[0:4])
- tfidf_filter(new_job_listings)
- # assert that the correct job ids are in the new filtered new_job_listings
- assert list(new_job_listings) == ['1', '3']
-
-
-def test_tfidf_filter_with_previous_scrape(per_id_job_listings):
- new_job_listings = per_id_job_listings([attr_list[0], attr_list[2]])
- # generate job listings with different job ids than new_job_listings
- previous_job_listings = per_id_job_listings(
- [attr_list[1], attr_list[3]], first_job_id=2)
- tfidf_filter(new_job_listings, previous_job_listings)
- # assert that the new job listings have been removed since they already exist
- assert len(new_job_listings) == 0
- # assert that the correct job ids are in the new filtered new_job_listings
- assert list(previous_job_listings) == ['2', '3']
diff --git a/tests/test_indeed.py b/tests/test_indeed.py
deleted file mode 100644
index b42dac8b..00000000
--- a/tests/test_indeed.py
+++ /dev/null
@@ -1,244 +0,0 @@
-from jobfunnel.indeed import Indeed
-from jobfunnel.tools.delay import delay_alg
-import pytest
-from bs4 import BeautifulSoup
-import re
-from .conftest import search_term_configs
-
-
-# test the correctness of search_terms since our tests depend on it
-
-def test_search_terms(init_scraper):
- indeed = init_scraper('indeed')
- assert indeed.search_terms == { 'region': {'province':'ON',
- 'city':'waterloo', 'domain':'ca', 'radius':25}, 'keywords':['Python']}
-
-@pytest.mark.parametrize('search_terms_config', search_term_configs)
-class TestClass():
-
- def test_convert_radius(self, init_scraper, search_terms_config):
- provider = init_scraper('indeed')
- provider.search_terms = search_terms_config
- assert 0 == provider.convert_radius(-1)
- assert 0 == provider.convert_radius(3)
- assert 5 == provider.convert_radius(7)
- assert 10 == provider.convert_radius(12)
- assert 15 == provider.convert_radius(20)
- assert 25 == provider.convert_radius(37)
- assert 50 == provider.convert_radius(75)
- assert 100 == provider.convert_radius(300)
-
-
- def test_get_search_url(self, init_scraper, search_terms_config):
- provider = init_scraper('indeed')
- provider.search_terms = search_terms_config
- if(provider.search_terms['region']['domain'] == 'ca'):
- assert'https://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
- with pytest.raises(ValueError) as e:
- provider.get_search_url('panda')
- assert str(e.value) == 'No html method panda exists'
- with pytest.raises(NotImplementedError) as e:
- provider.get_search_url('post')
-
-
- def test_get_num_pages_to_scrape(self, init_scraper, search_terms_config):
- provider = init_scraper('indeed')
- provider.search_terms = search_terms_config
- # get the search url
- search = provider.get_search_url()
-
- # get the html data, initialize bs4 with lxml
- request_html = provider.s.get(search, headers=provider.headers)
-
- # create the soup base
- soup_base = BeautifulSoup(request_html.text, provider.bs4_parser)
- assert provider.get_num_pages_to_scrape(soup_base, max=3) <= 3
-
-
- def test_search_page_for_job_soups(self, init_scraper, search_terms_config):
- provider = init_scraper('indeed')
- provider.search_terms = search_terms_config
- # get the search url
- search = provider.get_search_url()
-
- # get the html data, initialize bs4 with lxml
- request_html = provider.s.get(search, headers=provider.headers)
- job_soup_list = []
- provider.search_page_for_job_soups(search, 0, job_soup_list)
- assert 0 < len(job_soup_list)
-
-
-# test the process of fetching title data from a job
-
- def test_get_title(self, setup_scraper, search_terms_config):
- scraper = setup_scraper('indeed')
- job_soup_list = scraper['job_list']
- job = scraper['job_keys']
- provider = scraper['job_provider']
- provider.search_terms = search_terms_config
- for soup in job_soup_list:
- try:
- job['title'] = provider.get_title(soup)
- except AttributeError:
- continue
- if(0 < len(job['title'])):
- assert True
- return
- assert False
-
-
-# test the process of fetching company data from a job
-
- def test_get_company(self, setup_scraper, search_terms_config):
- scraper = setup_scraper('indeed')
- job_soup_list = scraper['job_list']
- job = scraper['job_keys']
- provider = scraper['job_provider']
- provider.search_terms = search_terms_config
- for soup in job_soup_list:
- try:
- job['company'] = provider.get_company(soup)
- except AttributeError:
- continue
- if(0 < len(job['company'])):
- assert True
- return
- assert False
-
-
-# test the process of fetching location data from a job
-
- def test_get_location(self, setup_scraper, search_terms_config):
- scraper = setup_scraper('indeed')
- job_soup_list = scraper['job_list']
- job = scraper['job_keys']
- provider = scraper['job_provider']
- provider.search_terms = search_terms_config
- for soup in job_soup_list:
- try:
- job['location'] = provider.get_location(soup)
- except AttributeError:
- continue
- if(0 < len(job['location'])):
- assert True
- return
- assert False
-
-
-# test the process of fetching date data from a job
-
- def test_get_date(self, setup_scraper, search_terms_config):
- scraper = setup_scraper('indeed')
- job_soup_list = scraper['job_list']
- job = scraper['job_keys']
- provider = scraper['job_provider']
- provider.search_terms = search_terms_config
- for soup in job_soup_list:
- try:
- job['date'] = provider.get_date(soup)
- except AttributeError:
- continue
- if(0 < len(job['date'])):
- assert True
- return
- assert False
-
-# Test the id with a strict assertion because without a job id we have
-# no job link, and without job link, we have no job to apply to
- def test_get_id(self, setup_scraper, search_terms_config):
- scraper = setup_scraper('indeed')
- job_soup_list = scraper['job_list']
- job = scraper['job_keys']
- provider = scraper['job_provider']
- provider.search_terms = search_terms_config
- for soup in job_soup_list:
- try:
- job['id'] = provider.get_id(soup)
- except:
- assert False
- assert True
-
-
-# test the process of fetching the link to a job
-
- def test_get_link(self, setup_scraper, search_terms_config):
- scraper = setup_scraper('indeed')
- job_soup_list = scraper['job_list']
- job = scraper['job_keys']
- provider = scraper['job_provider']
- provider.search_terms = search_terms_config
- for soup in job_soup_list:
- try:
- job['id'] = provider.get_id(soup)
- job['link'] = provider.get_link(job['id'])
- except AttributeError:
- continue
- if(0 < len(job['link'])):
- assert True
- return
-
- assert False
-
-
-# test the process of fetching the blurb from a job
-
- def test_get_blurb_with_delay(self, setup_scraper, search_terms_config):
- """
- Tests whether the process of fetching blurb data is working.
- """
- scraper = setup_scraper('indeed')
- provider = scraper['job_provider']
- job_soup_list = scraper['job_list']
- job = scraper['job_keys']
- provider.search_terms = search_terms_config
- for soup in job_soup_list:
- try:
- job['id'] = provider.get_id(soup)
- job['link'] = provider.get_link(job['id'])
- res_job, html = provider.get_blurb_with_delay(job, delay_alg(
- len(job_soup_list), provider.delay_config)[0])
- provider.parse_blurb(job, html)
- except AttributeError:
- continue
- if(0 < len(job['blurb'])):
- assert True
- return
-
- assert False
-
-
-
- def test_search_joblink_for_blurb(self, setup_scraper, search_terms_config):
- """
- Tests whether the process of fetching blurb data is working.
- This test assumes that no delay configuration has been set.
- """
- scraper = setup_scraper('indeed')
- provider = scraper['job_provider']
- job_soup_list = scraper['job_list']
- job = scraper['job_keys']
- provider.delay_config = None
- provider.search_terms = search_terms_config
- for soup in job_soup_list:
- try:
- job['id'] = provider.get_id(soup)
- job['link'] = provider.get_link(job['id'])
- provider.search_joblink_for_blurb(job)
- except AttributeError:
- continue
- if(0 < len(job['blurb'])):
- assert True
- return
-
- assert False
-
-
- # Test the entire integration
-
- def test_scrape(self, init_scraper, mocker,
- search_terms_config):
- # ensure that we don't scrape more than one page
- mocker.patch('jobfunnel.indeed.Indeed.get_num_pages_to_scrape', return_value=1)
- provider = init_scraper('indeed')
- provider.search_terms = search_terms_config
- provider.scrape()
diff --git a/tests/test_parse.py b/tests/test_parse.py
deleted file mode 100644
index 86dd276e..00000000
--- a/tests/test_parse.py
+++ /dev/null
@@ -1,229 +0,0 @@
-import pytest
-import sys
-import os
-import yaml
-
-from pathlib import Path
-from unittest.mock import patch
-
-from jobfunnel.config.parser import parse_config, parse_cli, cli_to_yaml, update_yaml, check_config_types, log_levels
-
-
-config_dict = {
- 'output_path': 'fish',
- 'providers': ['Indeed', 'Monster'],
- 'search_terms': {
- 'region': {
- 'state': 'NY',
- 'city': 'New York',
- 'domain': 'com',
- }
- }
-}
-
-config_dict_fail = {
- 'this_should_fail': False
-}
-
-cli_options = [
- ['', '-s', 'settings.yaml'],
- ['', '-o', '.'],
- ['', '-kw', 'java', 'python'],
- ['', '-p', 'ON'],
- ['', '--city', 'New York'],
- ['', '--domain', 'com'],
- ['', '-r'],
- ['', '-c'],
- ['', '-d', '20'],
- ['', '-md', '10'],
- ['', '--fun', 'linear'],
- ['', '--log_level', 'info'],
- ['', '--similar'],
- ['', '--no_scrape'],
- # US proxy grabbed from https://www.free-proxy-list.net/
- ['', '--proxy', 'http://50.193.9.202:53888'],
- ['', '--recover'],
- ['', '--save_dup'],
- ['', '--max_listing_days', '30'],
-]
-
-
-# test parse_cli with all command line options
-
-@pytest.mark.parametrize('option', cli_options)
-def test_parse_cli_pass(option):
- with patch.object(sys, 'argv', option):
- config = parse_cli()
-
-
-# test Parse_cli with an invalid argument
-
-def test_parse_cli_fail():
- with patch.object(sys, 'argv', ['', 'invalid_arg']):
- with pytest.raises(SystemExit):
- config = parse_cli()
-
-
-@pytest.mark.parametrize('option', cli_options)
-def test_parse_cli_to_yaml_pass(option):
- with patch.object(sys, 'argv', option):
- cli = parse_cli()
- cli_to_yaml(cli)
-
-
-# create config fixture to avoid code duplication
-
-@pytest.fixture()
-def config_dependency():
- def setup(default_path='config/settings.yaml', patch_path=None):
- """Does everything parse_config does up until loading the settings file passed in
- by the user, if they choose to pass one, to prepare the config dictionary for
- other tests to use. This fixture assumes that the tests
- test_parse_cli_* and test_parse_cli_to_yaml_* have passed.
-
- Returns the dictionary with keys 'config', 'given_yaml' and 'cli_yaml'
-
- It is ensured that config and given_yaml are valid, otherwise an exception is thrown.
- """
- # find the jobfunnel root dir
- jobfunnel_path = os.path.normpath(
- os.path.join(os.path.dirname(__file__), '../jobfunnel'))
-
- # load the default settings
- default_yaml_path = os.path.join(jobfunnel_path, default_path)
- default_yaml = yaml.safe_load(open(default_yaml_path, 'r'))
-
- # parse the command line arguments
- if patch_path == None:
- with patch.object(sys, 'argv', ['', '-s', default_yaml_path]):
- cli = parse_cli()
- else:
- with patch.object(sys, 'argv', ['', '-s', patch_path]):
- cli = parse_cli()
- cli_yaml = cli_to_yaml(cli)
-
- # parse the settings file for the line arguments
- given_yaml = None
- given_yaml_path = None
- if cli.settings is not None:
- given_yaml_path = os.path.dirname(cli.settings)
- given_yaml = yaml.safe_load(open(cli.settings, 'r'))
-
- config = default_yaml
- return {'config': config, 'given_yaml': given_yaml,
- 'cli_yaml': cli_yaml}
- return setup
-
-
-# test update_yaml with every command line option
-
-@pytest.mark.parametrize('option', cli_options)
-def test_update_yaml_pass(option, config_dependency):
- config_setup = config_dependency()
- with patch.object(sys, 'argv', option):
- # parse the command line arguments
- cli = parse_cli()
- cli_yaml = cli_to_yaml(cli)
-
- # parse the settings file for the line arguments
- given_yaml = None
- if cli.settings is not None:
- # take this opportunity to ensure that the demo settings file exists
- given_yaml = config_setup['given_yaml']
-
- # combine default, given and argument yamls into one. Note that we update
- # the values of the default_yaml, so we use this for the rest of the file.
- # We could make a deep copy if necessary.
- config = config_setup['config']
-
- if given_yaml is not None:
- update_yaml(config, given_yaml)
- update_yaml(config, cli_yaml)
-
-
-def test_check_config_types_fail(tmpdir, config_dependency):
- # create temporary settings file and write yaml file
- yaml_file = Path(tmpdir) / 'settings.yaml'
- with open(yaml_file, mode='w') as f:
- yaml.dump(config_dict_fail, f)
-
- # create an invalid config_dependency with data from config_dict_fail
- config_setup = config_dependency(patch_path=str(yaml_file))
- config = config_setup['config']
- given_yaml = config_setup['given_yaml']
- cli_yaml = config_setup['cli_yaml']
- if given_yaml is not None:
- update_yaml(config, given_yaml)
- update_yaml(config, cli_yaml)
- with pytest.raises(KeyError):
- check_config_types(config)
-
-
-def test_user_yaml(tmpdir):
- # create temporary settings file and write yaml file
- yaml_file = Path(tmpdir) / 'settings.yaml'
- with open(yaml_file, mode='w') as f:
- yaml.dump(config_dict, f)
-
- # call funnel with user-defined settings
- with patch.object(sys, 'argv', ['', '-s', str(yaml_file)]):
- config = parse_config()
-
- assert config['output_path'] == "fish"
- assert set(config['providers']) == set(['indeed', 'monster'])
- assert config['search_terms']['region']['state'] == 'NY'
- # assert config['search_terms']['region']['province'] == 'NY' # I believe this should pass
- assert config['search_terms']['region']['city'] == 'New York'
- assert config['search_terms']['region']['domain'] == 'com'
- assert config['search_terms']['region']['radius'] == 25
-
-
-# test the final config from parse_config with each command line option
-
-def test_cli_yaml():
- with patch.object(sys, 'argv', cli_options[1]):
- config = parse_config()
- assert config['output_path'] == '.'
- with patch.object(sys, 'argv', cli_options[2]):
- config = parse_config()
- assert config['search_terms']['keywords'] == ['java', 'python']
- with patch.object(sys, 'argv', cli_options[3]):
- config = parse_config()
- assert config['search_terms']['region']['province'] == 'ON'
- with patch.object(sys, 'argv', cli_options[4]):
- config = parse_config()
- assert config['search_terms']['region']['city'] == 'New York'
- with patch.object(sys, 'argv', cli_options[5]):
- config = parse_config()
- assert config['search_terms']['region']['domain'] == 'com'
- with patch.object(sys, 'argv', cli_options[6]):
- config = parse_config()
- assert config['delay_config']['random'] is True
- with patch.object(sys, 'argv', cli_options[7]):
- config = parse_config()
- assert config['delay_config']['converge'] is True
- with patch.object(sys, 'argv', cli_options[8]):
- config = parse_config()
- assert config['delay_config']['delay'] == 20
- with patch.object(sys, 'argv', cli_options[9]):
- config = parse_config()
- assert config['delay_config']['min_delay'] == 10
- with patch.object(sys, 'argv', cli_options[10]):
- config = parse_config()
- assert config['delay_config']['function'] == 'linear'
- with patch.object(sys, 'argv', cli_options[11]):
- config = parse_config()
- assert config['log_level'] == log_levels['info']
- with patch.object(sys, 'argv', cli_options[12]):
- config = parse_config()
- assert config['similar'] is True
- with patch.object(sys, 'argv', cli_options[13]):
- config = parse_config()
- assert config['no_scrape'] is True
- with patch.object(sys, 'argv', cli_options[14]):
- config = parse_config()
- assert config['proxy'] == {
- 'protocol': 'http',
- 'ip_address': '50.193.9.202',
- 'port': '53888'
- }
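For context on the deleted config tests above: the old parser layered three sources onto one dictionary, mutating the defaults in place (default settings, then the user's settings YAML, then CLI overrides), which is why test_update_yaml_pass reuses the same config object. A minimal sketch of that layering, assuming update_yaml is a recursive in-place merge that skips unset (None) values; this is inferred from the call sites above, not the removed implementation:

def update_yaml(config: dict, other: dict) -> None:
    """Recursively overlay `other` onto `config`, mutating `config` in place."""
    for key, value in other.items():
        if isinstance(value, dict) and isinstance(config.get(key), dict):
            update_yaml(config[key], value)  # descend into nested sections
        elif value is not None:  # assumed: options left unset stay None and are skipped
            config[key] = value

# layering order exercised by the deleted tests:
#   config = default_yaml
#   update_yaml(config, given_yaml)  # user settings file, if any
#   update_yaml(config, cli_yaml)    # command-line values win last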
diff --git a/tests/test_tools.py b/tests/test_tools.py
deleted file mode 100644
index 30337220..00000000
--- a/tests/test_tools.py
+++ /dev/null
@@ -1,211 +0,0 @@
-import pytest
-
-from dateutil.relativedelta import relativedelta
-from datetime import datetime, timedelta
-
-from jobfunnel.tools.tools import split_url, proxy_dict_to_url, config_factory, post_date_from_relative_post_age, filter_non_printables
-
-
-URLS = [
- {
- 'url': 'https://192.168.178.20:812',
- 'splits': {
- 'protocol': 'https',
- 'ip_address': '192.168.178.20',
- 'port': '812'
- },
- 'complete': True
- },
- {
- 'url': '1.168.178.20:812',
- 'splits': {
- 'protocol': '',
- 'ip_address': '1.168.178.20',
- 'port': '812'
- },
- 'complete': False
- },
- {
- 'url': 'https://192.168.178.20',
- 'splits': {
- 'protocol': 'https',
- 'ip_address': '192.168.178.20',
- 'port': ''
- },
- 'complete': False
- },
- {
- 'url': '192.168.178.20',
- 'splits': {
- 'protocol': '',
- 'ip_address': '192.168.178.20',
- 'port': ''
- },
- 'complete': False
- }
-]
-
-# Define an attribute list for all tests to use in this module
-
-attr_list = [
- [['title'], 'Test Engineer'],
- [['title'], 'Software Engineer–'],
- [['blurb'], 'Test and develop'],
- [['blurb'], 'Develop and design software–'],
- [['date'], 'Just posted'],
- [['date'], 'today'],
- [['date'], '1 hour ago'],
- [['date'], '2 hours ago'],
- [['date'], 'yesterday'],
- [['date'], '1 day ago'],
- [['date'], '2 days ago'],
- [['date'], '1 month'],
- [['date'], '2 months'],
- [['date'], '1 year ago'],
- [['date'], '2 years ago'],
- [['date'], '1 epoch ago'],
- [['date'], 'junk'],
- [['some_option'], 'option_value']
-]
-
-# test clean/dirty characters that may appear in title and blurb fields
-
-def test_filter_non_printables_clean_title(job_listings):
- job_list = job_listings(attr_list[0:1])
- filter_non_printables(job_list[0])
- assert job_list[0]['title'] == 'Test Engineer'
-
-
-def test_filter_non_printables_dirty_title(job_listings):
- job_list = job_listings(attr_list[1:2])
- filter_non_printables(job_list[0])
- assert job_list[0]['title'] == 'Software Engineer'
-
-
-def test_filter_non_printables_clean_blurb(job_listings):
- job_list = job_listings(attr_list[2:3])
- filter_non_printables(job_list[0])
- assert job_list[0]['blurb'] == 'Test and develop'
-
-
-def test_filter_non_printables_dirty_blurb(job_listings):
- job_list = job_listings(attr_list[3:4])
- filter_non_printables(job_list[0])
- assert job_list[0]['blurb'] == 'Develop and design software'
-
-# test job_listing dates with all possible formats
-
-def test_post_date_from_relative_post_age_just_posted_pass(job_listings):
- job_list = job_listings(attr_list[4:5])
- post_date_from_relative_post_age(job_list)
- assert datetime.now().strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_post_age_today_pass(job_listings):
- job_list = job_listings(attr_list[5:6])
- post_date_from_relative_post_age(job_list)
- assert datetime.now().strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_post_age_1_hour_ago_pass(job_listings):
- job_list = job_listings(attr_list[6:7])
- post_date_from_relative_post_age(job_list)
- now = datetime.now()
- assert now.strftime('%Y-%m-%d') == job_list[0]['date'] or \
- (now - timedelta(days=int(1))).strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_post_age_2_hours_ago_pass(job_listings):
- job_list = job_listings(attr_list[7:8])
- post_date_from_relative_post_age(job_list)
- now = datetime.now()
- assert now.strftime('%Y-%m-%d') == job_list[0]['date'] or \
- (now - timedelta(days=int(1))).strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_yesterday_pass(job_listings):
- job_list = job_listings(attr_list[8:9])
- post_date_from_relative_post_age(job_list)
- yesterday = datetime.now() - timedelta(days=int(1))
- assert yesterday.strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_1_day_ago_pass(job_listings):
- job_list = job_listings(attr_list[9:10])
- post_date_from_relative_post_age(job_list)
- one_day_ago = datetime.now() - timedelta(days=int(1))
- assert one_day_ago.strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_2_days_ago_pass(job_listings):
- job_list = job_listings(attr_list[10:11])
- post_date_from_relative_post_age(job_list)
- two_days_ago = datetime.now() - timedelta(days=int(2))
- assert two_days_ago.strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_1_month_ago_pass(job_listings):
- job_list = job_listings(attr_list[11:12])
- post_date_from_relative_post_age(job_list)
- one_month_ago = datetime.now() - relativedelta(months=int(1))
- assert one_month_ago.strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_2_months_ago_pass(job_listings):
- job_list = job_listings(attr_list[12:13])
- post_date_from_relative_post_age(job_list)
- two_months_ago = datetime.now() - relativedelta(months=int(2))
- assert two_months_ago.strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_1_year_ago_pass(job_listings):
- job_list = job_listings(attr_list[13:14])
- post_date_from_relative_post_age(job_list)
- one_year_ago = datetime.now() - relativedelta(years=int(1))
- assert one_year_ago.strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_2_years_ago_pass(job_listings):
- job_list = job_listings(attr_list[14:15])
- post_date_from_relative_post_age(job_list)
- two_years_ago = datetime.now() - relativedelta(years=int(2))
- assert two_years_ago.strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_1_epoch_ago_pass(job_listings):
- job_list = job_listings(attr_list[15:16])
- post_date_from_relative_post_age(job_list)
- assert datetime(1970, 1, 1).strftime('%Y-%m-%d') == job_list[0]['date']
-
-
-def test_post_date_from_relative_ago_post_age_junk(job_listings):
- job_list = job_listings(attr_list[16:17])
- post_date_from_relative_post_age(job_list)
- assert datetime(1970, 1, 1).strftime('%Y-%m-%d') == job_list[0]['date']
-
-
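The relative-date tests above pin down post_date_from_relative_post_age: 'just posted', 'today' and hour offsets resolve to (roughly) today, day/month/year offsets are subtracted from the current date, and unparseable strings ('1 epoch ago', 'junk') fall back to the Unix epoch. A rough sketch consistent with those assertions, assuming a simple regex over the date text; it is an illustration, not the removed jobfunnel.tools.tools implementation:

import re
from datetime import datetime
from dateutil.relativedelta import relativedelta

def post_date_from_relative_post_age(job_list: list) -> None:
    """Convert relative ages such as '2 days ago' into absolute '%Y-%m-%d' dates."""
    now = datetime.now()
    for job in job_list:
        date_str = job['date'].lower()
        match = re.search(r'(\d+)\s*(hour|day|month|year)', date_str)
        if 'just posted' in date_str or 'today' in date_str:
            post_date = now
        elif 'yesterday' in date_str:
            post_date = now - relativedelta(days=1)
        elif match:
            value, unit = int(match.group(1)), match.group(2)
            post_date = now - relativedelta(**{unit + 's': value})
        else:
            post_date = datetime(1970, 1, 1)  # epoch fallback for junk input
        job['date'] = post_date.strftime('%Y-%m-%d')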
-def test_config_factory(configure_options):
- config = config_factory(configure_options(
- ['']), attr_list[17:18])[0]
- assert config['some_option'] == 'option_value'
-
-
-@pytest.mark.parametrize('url', URLS)
-def test_split_url(url):
- # gives dictionary with protocol, ip and port
- url_dic = split_url(url['url'])
-
- # check if all elements match with provided output
- if url['complete']:
- assert url_dic == url['splits']
- else:
- assert url_dic is None
-
-
-@pytest.mark.parametrize('url', URLS)
-def test_proxy_dict_to_url(url):
- # build the proxy url string from the protocol, ip and port dictionary
- url_str = proxy_dict_to_url(url['splits'])
-
- # check if all elements match with provided output
- assert url_str == url['url']
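The URLS fixtures above fully constrain the two proxy helpers: split_url returns a dict only when protocol, ip_address and port are all present (None otherwise), and proxy_dict_to_url rebuilds the string by omitting any empty part. A minimal sketch matching those cases, again an illustration rather than the deleted implementation:

import re
from typing import Optional

def split_url(url: str) -> Optional[dict]:
    """Split a proxy url into protocol, ip_address and port, or return None."""
    match = re.match(
        r'^(?:(?P<protocol>\w+)://)?(?P<ip_address>[\d.]+)(?::(?P<port>\d+))?$', url)
    if not match:
        return None
    parts = {key: value or '' for key, value in match.groupdict().items()}
    return parts if all(parts.values()) else None  # None unless 'complete'

def proxy_dict_to_url(splits: dict) -> str:
    """Rebuild the url string, skipping empty protocol/port parts."""
    protocol = f"{splits['protocol']}://" if splits['protocol'] else ''
    port = f":{splits['port']}" if splits['port'] else ''
    return f"{protocol}{splits['ip_address']}{port}"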
diff --git a/tests/test_validate.py b/tests/test_validate.py
deleted file mode 100644
index 644c0f5e..00000000
--- a/tests/test_validate.py
+++ /dev/null
@@ -1,168 +0,0 @@
-import pytest
-import sys
-
-from unittest.mock import patch
-
-from jobfunnel.config.parser import parse_config
-from jobfunnel.config.validate import validate_config, validate_delay, validate_region
-from jobfunnel.tools.tools import config_factory
-
-
-# define attribute overrides that each produce an invalid config
-# (bad paths, providers, region values and delay settings)
-attr_list = [
- [['master_list_path'], 'masterzz_list.csv'],
- [['providers'], ['indeed', 'twitter']],
- [['search_terms', 'region', 'domain'], 'cjas'],
- [['search_terms', 'region', 'province'], None],
- [['delay_config', 'function'], 'weird'],
- [['delay_config', 'min_delay'], 50.0],
- [['delay_config', 'min_delay'], -1],
- [['delay_config', 'delay'], 2],
- [['max_listing_days'], -1],
- [['data_path'], 'data_dump'],
- [['duplicate_list_path'], 'duplicate_list_.csv'],
- [['log_path'], 'data/jobfunnel_.log'],
- [['filter_list_path'], 'data/filter_list_.json']
-]
-
-# test all paths with invalid values
-
-def test_filter_list_path_fail(configure_options):
- path_configs = config_factory(
- configure_options(['']), attr_list[12: 13])[0]
- with pytest.raises(Exception) as e:
- validate_config(path_configs)
- assert str(e.value) == 'filter_list_path'
-
-
-def test_log_path_fail(configure_options):
- path_configs = config_factory(configure_options(['']), attr_list[11:12])[0]
- with pytest.raises(Exception) as e:
- validate_config(path_configs)
- assert str(e.value) == 'log_path'
-
-
-def test_duplicate_list_path_fail(configure_options):
- path_configs = config_factory(
- configure_options(['']), attr_list[10: 11])[0]
- with pytest.raises(Exception) as e:
- validate_config(path_configs)
- assert str(e.value) == 'duplicate_list_path'
-
-
-def test_data_path_fail(configure_options):
- path_configs = config_factory(configure_options(['']), attr_list[9: 10])[0]
- with pytest.raises(Exception) as e:
- validate_config(path_configs)
- assert str(e.value) == 'data_path'
-
-
-def test_master_list_path_fail(configure_options):
- path_configs = config_factory(configure_options(['']), attr_list[0: 1])[0]
- with pytest.raises(Exception) as e:
- validate_config(path_configs)
- assert str(e.value) == 'master_list_path'
-
-
-# test with invalid providers
-
-def test_providers_fail(configure_options):
- providers_config = config_factory(
- configure_options(['']), attr_list[1: 2])[0]
- with pytest.raises(Exception) as e:
- validate_config(providers_config)
- assert str(e.value) == 'providers'
-
-
-# test with invalid regions and domains
-
-def test_domain_fail(configure_options):
- region_config = config_factory(configure_options(['']), attr_list[2:3])[0]
- with pytest.raises(Exception) as e:
- validate_region(region_config['search_terms']['region'])
- assert str(e.value) == 'domain'
-
-
-def test_province_fail(configure_options):
- region_config = config_factory(configure_options(['']), attr_list[3:4])[0]
- with pytest.raises(Exception) as e:
- validate_region(region_config['search_terms']['region'])
- assert str(e.value) == 'province'
-
-
-# test validate_region with the default valid configuration
-
-def test_region_pass(configure_options):
- validate_region(configure_options([''])['search_terms']['region'])
-
-
-# generate config with invalid delay function name
-
-def test_delay_function_fail(configure_options):
- delay_configs = config_factory(configure_options(['']), attr_list[4: 5])[0]
- with pytest.raises(Exception) as e:
- validate_delay(delay_configs['delay_config'])
- assert str(e.value) == 'delay_function'
-
-
-# test delay_function with original configuration
-
-def test_delay_function_pass(configure_options):
- validate_delay(configure_options([''])['delay_config'])
-
-
-# generate config with invalid min delay value of -1
-
-def test_delay_min_delay_fail(configure_options):
- delay_configs = config_factory(configure_options(['']), attr_list[6: 7])[0]
- with pytest.raises(Exception) as e:
- validate_delay(delay_configs['delay_config'])
- assert str(e.value) == '(min)_delay'
-
-
-# test validate_delay with a min_delay greater than delay
-
-def test_delay_min_delay_greater_than_delay_fail(configure_options):
- delay_configs = config_factory(configure_options(['']), attr_list[5: 6])[0]
- with pytest.raises(Exception) as e:
- validate_delay(delay_configs['delay_config'])
- assert str(e.value) == '(min)_delay'
-
-
-# test validate_delay with a delay less than 10 (the minimum)
-
-def test_delay_less_than_10_fail(configure_options):
- delay_configs = config_factory(configure_options(['']), attr_list[7: 8])[0]
- with pytest.raises(Exception) as e:
- validate_delay(delay_configs['delay_config'])
- assert str(e.value) == '(min)_delay'
-
-
-# test validate_delay with the original configuration
-
-def test_delay_pass(configure_options):
- validate_delay(configure_options([''])['delay_config'])
-
-
-# test validate_config with a max_listing_days value of -1
-
-def test_delay_max_listing_days_fail(configure_options):
- max_listing_days_config = config_factory(
- configure_options(['']), attr_list[8: 9])[0]
- with pytest.raises(Exception) as e:
- validate_config(max_listing_days_config)
- assert str(e.value) == 'max_listing_days'
-
-
-# test the integration of all parts with the config as a whole
-
-@pytest.mark.parametrize('attribute', attr_list)
-def test_config_fail(configure_options, attribute):
- config = config_factory(configure_options(['']), [attribute])[0]
- with pytest.raises(Exception):
- validate_config(config)
-
-
-def test_config_pass(configure_options):
- validate_config(configure_options(['']))
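Most of the validation tests above build their invalid configs through config_factory, which takes the valid options plus a list of [key_path, value] overrides and returns one modified copy per override; validate_config is then expected to raise with the offending key as the exception message, which is what the str(e.value) assertions check. A minimal sketch of that factory as implied by its call sites (an assumption, not the removed helper):

from copy import deepcopy

def config_factory(config: dict, attr_list: list) -> list:
    """Return one modified copy of `config` per [key_path, value] override."""
    configs = []
    for key_path, value in attr_list:
        modified = deepcopy(config)  # keep the valid baseline untouched
        node = modified
        for key in key_path[:-1]:
            node = node.setdefault(key, {})
        node[key_path[-1]] = value  # plant the (usually invalid) value
        configs.append(modified)
    return configs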