format: Apply Black formatter to the codebase (#127)

pull/128/head
VitaminB16 2024-03-11 04:36:27 +00:00 committed by GitHub
parent e8b4b376b8
commit 94d8f555fd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 524 additions and 208 deletions

.pre-commit-config.yaml (new file, +7)

@ -0,0 +1,7 @@
repos:
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
- id: black
language_version: python
args: [--line-length=88, --quiet]
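
Once this hook is installed locally (typically with pre-commit install, plus a one-off pre-commit run --all-files over the whole tree), Black runs on staged Python files before every commit. The bulk of the diff below is the output of that formatter pass: single quotes become double quotes, trailing commas are added to multi-line calls and literals, and expressions longer than 88 columns are wrapped; a few long lines were also refactored by hand into intermediate variables so the result fits the limit. A minimal sketch of one such rewrite, based on the scrape_site change further down (the helper name is illustrative, not part of the codebase):

# Before (single quotes, one line well past 88 columns):
#   site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()
# After (double quotes, an intermediate variable keeps each line under 88 columns):
def normalize_site_name(raw_value: str) -> str:
    """Illustrative helper mirroring the reformatted scrape_site logic."""
    cap_name = raw_value.capitalize()
    return "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name

if __name__ == "__main__":
    print(normalize_site_name("zip_recruiter"))  # prints ZipRecruiter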

poetry.lock (generated, 204 changes)

@ -203,6 +203,52 @@ soupsieve = ">1.2"
html5lib = ["html5lib"] html5lib = ["html5lib"]
lxml = ["lxml"] lxml = ["lxml"]
[[package]]
name = "black"
version = "24.2.0"
description = "The uncompromising code formatter."
optional = false
python-versions = ">=3.8"
files = [
{file = "black-24.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6981eae48b3b33399c8757036c7f5d48a535b962a7c2310d19361edeef64ce29"},
{file = "black-24.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d533d5e3259720fdbc1b37444491b024003e012c5173f7d06825a77508085430"},
{file = "black-24.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61a0391772490ddfb8a693c067df1ef5227257e72b0e4108482b8d41b5aee13f"},
{file = "black-24.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:992e451b04667116680cb88f63449267c13e1ad134f30087dec8527242e9862a"},
{file = "black-24.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:163baf4ef40e6897a2a9b83890e59141cc8c2a98f2dda5080dc15c00ee1e62cd"},
{file = "black-24.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e37c99f89929af50ffaf912454b3e3b47fd64109659026b678c091a4cd450fb2"},
{file = "black-24.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9de21bafcba9683853f6c96c2d515e364aee631b178eaa5145fc1c61a3cc92"},
{file = "black-24.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:9db528bccb9e8e20c08e716b3b09c6bdd64da0dd129b11e160bf082d4642ac23"},
{file = "black-24.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d84f29eb3ee44859052073b7636533ec995bd0f64e2fb43aeceefc70090e752b"},
{file = "black-24.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e08fb9a15c914b81dd734ddd7fb10513016e5ce7e6704bdd5e1251ceee51ac9"},
{file = "black-24.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:810d445ae6069ce64030c78ff6127cd9cd178a9ac3361435708b907d8a04c693"},
{file = "black-24.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ba15742a13de85e9b8f3239c8f807723991fbfae24bad92d34a2b12e81904982"},
{file = "black-24.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7e53a8c630f71db01b28cd9602a1ada68c937cbf2c333e6ed041390d6968faf4"},
{file = "black-24.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:93601c2deb321b4bad8f95df408e3fb3943d85012dddb6121336b8e24a0d1218"},
{file = "black-24.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0057f800de6acc4407fe75bb147b0c2b5cbb7c3ed110d3e5999cd01184d53b0"},
{file = "black-24.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:faf2ee02e6612577ba0181f4347bcbcf591eb122f7841ae5ba233d12c39dcb4d"},
{file = "black-24.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:057c3dc602eaa6fdc451069bd027a1b2635028b575a6c3acfd63193ced20d9c8"},
{file = "black-24.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:08654d0797e65f2423f850fc8e16a0ce50925f9337fb4a4a176a7aa4026e63f8"},
{file = "black-24.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca610d29415ee1a30a3f30fab7a8f4144e9d34c89a235d81292a1edb2b55f540"},
{file = "black-24.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:4dd76e9468d5536abd40ffbc7a247f83b2324f0c050556d9c371c2b9a9a95e31"},
{file = "black-24.2.0-py3-none-any.whl", hash = "sha256:e8a6ae970537e67830776488bca52000eaa37fa63b9988e8c487458d9cd5ace6"},
{file = "black-24.2.0.tar.gz", hash = "sha256:bce4f25c27c3435e4dace4815bcb2008b87e167e3bf4ee47ccdc5ce906eb4894"},
]
[package.dependencies]
click = ">=8.0.0"
mypy-extensions = ">=0.4.3"
packaging = ">=22.0"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]] [[package]]
name = "bleach" name = "bleach"
version = "6.0.0" version = "6.0.0"
@ -308,6 +354,17 @@ files = [
[package.dependencies] [package.dependencies]
pycparser = "*" pycparser = "*"
[[package]]
name = "cfgv"
version = "3.4.0"
description = "Validate configuration and produce human readable error messages."
optional = false
python-versions = ">=3.8"
files = [
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
]
[[package]] [[package]]
name = "charset-normalizer" name = "charset-normalizer"
version = "3.2.0" version = "3.2.0"
@ -392,6 +449,20 @@ files = [
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
] ]
[[package]]
name = "click"
version = "8.1.7"
description = "Composable command line interface toolkit"
optional = false
python-versions = ">=3.7"
files = [
{file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
{file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]] [[package]]
name = "colorama" name = "colorama"
version = "0.4.6" version = "0.4.6"
@ -471,6 +542,17 @@ files = [
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
] ]
[[package]]
name = "distlib"
version = "0.3.8"
description = "Distribution utilities"
optional = false
python-versions = "*"
files = [
{file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"},
{file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
]
[[package]] [[package]]
name = "exceptiongroup" name = "exceptiongroup"
version = "1.1.3" version = "1.1.3"
@ -513,6 +595,22 @@ files = [
[package.extras] [package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
[[package]]
name = "filelock"
version = "3.13.1"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.8"
files = [
{file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"},
{file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"},
]
[package.extras]
docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"]
testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
typing = ["typing-extensions (>=4.8)"]
[[package]] [[package]]
name = "fqdn" name = "fqdn"
version = "1.5.1" version = "1.5.1"
@ -524,6 +622,20 @@ files = [
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
] ]
[[package]]
name = "identify"
version = "2.5.35"
description = "File identification library for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "identify-2.5.35-py2.py3-none-any.whl", hash = "sha256:c4de0081837b211594f8e877a6b4fad7ca32bbfc1a9307fdd61c28bfe923f13e"},
{file = "identify-2.5.35.tar.gz", hash = "sha256:10a7ca245cfcd756a554a7288159f72ff105ad233c7c4b9c6f0f4d108f5f6791"},
]
[package.extras]
license = ["ukkonen"]
[[package]] [[package]]
name = "idna" name = "idna"
version = "3.4" version = "3.4"
@ -1125,6 +1237,17 @@ files = [
{file = "mistune-3.0.1.tar.gz", hash = "sha256:e912116c13aa0944f9dc530db38eb88f6a77087ab128f49f84a48f4c05ea163c"}, {file = "mistune-3.0.1.tar.gz", hash = "sha256:e912116c13aa0944f9dc530db38eb88f6a77087ab128f49f84a48f4c05ea163c"},
] ]
[[package]]
name = "mypy-extensions"
version = "1.0.0"
description = "Type system extensions for programs checked with the mypy type checker."
optional = false
python-versions = ">=3.5"
files = [
{file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
]
[[package]] [[package]]
name = "nbclient" name = "nbclient"
version = "0.8.0" version = "0.8.0"
@ -1216,6 +1339,20 @@ files = [
{file = "nest_asyncio-1.5.7.tar.gz", hash = "sha256:6a80f7b98f24d9083ed24608977c09dd608d83f91cccc24c9d2cba6d10e01c10"}, {file = "nest_asyncio-1.5.7.tar.gz", hash = "sha256:6a80f7b98f24d9083ed24608977c09dd608d83f91cccc24c9d2cba6d10e01c10"},
] ]
[[package]]
name = "nodeenv"
version = "1.8.0"
description = "Node.js virtual environment builder"
optional = false
python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*"
files = [
{file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"},
{file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"},
]
[package.dependencies]
setuptools = "*"
[[package]] [[package]]
name = "notebook" name = "notebook"
version = "7.0.3" version = "7.0.3"
@ -1402,6 +1539,17 @@ files = [
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
testing = ["docopt", "pytest (<6.0.0)"] testing = ["docopt", "pytest (<6.0.0)"]
[[package]]
name = "pathspec"
version = "0.12.1"
description = "Utility library for gitignore style pattern matching of file paths."
optional = false
python-versions = ">=3.8"
files = [
{file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"},
{file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
]
[[package]] [[package]]
name = "pexpect" name = "pexpect"
version = "4.8.0" version = "4.8.0"
@ -1457,6 +1605,24 @@ files = [
dev = ["pre-commit", "tox"] dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"] testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "pre-commit"
version = "3.6.2"
description = "A framework for managing and maintaining multi-language pre-commit hooks."
optional = false
python-versions = ">=3.9"
files = [
{file = "pre_commit-3.6.2-py2.py3-none-any.whl", hash = "sha256:ba637c2d7a670c10daedc059f5c49b5bd0aadbccfcd7ec15592cf9665117532c"},
{file = "pre_commit-3.6.2.tar.gz", hash = "sha256:c3ef34f463045c88658c5b99f38c1e297abdcc0ff13f98d3370055fbbfabc67e"},
]
[package.dependencies]
cfgv = ">=2.0.0"
identify = ">=1.0.0"
nodeenv = ">=0.11.1"
pyyaml = ">=5.1"
virtualenv = ">=20.10.0"
[[package]] [[package]]
name = "prometheus-client" name = "prometheus-client"
version = "0.17.1" version = "0.17.1"
@ -2183,6 +2349,22 @@ nativelib = ["pyobjc-framework-Cocoa", "pywin32"]
objc = ["pyobjc-framework-Cocoa"] objc = ["pyobjc-framework-Cocoa"]
win32 = ["pywin32"] win32 = ["pywin32"]
[[package]]
name = "setuptools"
version = "69.1.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.8"
files = [
{file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"},
{file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]] [[package]]
name = "six" name = "six"
version = "1.16.0" version = "1.16.0"
@ -2383,6 +2565,26 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
zstd = ["zstandard (>=0.18.0)"] zstd = ["zstandard (>=0.18.0)"]
[[package]]
name = "virtualenv"
version = "20.25.1"
description = "Virtual Python Environment builder"
optional = false
python-versions = ">=3.7"
files = [
{file = "virtualenv-20.25.1-py3-none-any.whl", hash = "sha256:961c026ac520bac5f69acb8ea063e8a4f071bcc9457b9c1f28f6b085c511583a"},
{file = "virtualenv-20.25.1.tar.gz", hash = "sha256:e08e13ecdca7a0bd53798f356d5831434afa5b07b93f0abdf0797b7a06ffe197"},
]
[package.dependencies]
distlib = ">=0.3.7,<1"
filelock = ">=3.12.2,<4"
platformdirs = ">=3.9.1,<5"
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
[[package]] [[package]]
name = "wcwidth" name = "wcwidth"
version = "0.2.6" version = "0.2.6"
@ -2450,4 +2652,4 @@ files = [
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "ba7f7cc9b6833a4a6271981f90610395639dd8b9b3db1370cbd1149d70cc9632" content-hash = "6ee18819a726314f61f20f0ed93b2db2a26c232269f045146d9a8f4e3f31eb01"

pyproject.toml

@ -24,7 +24,12 @@ markdownify = "^0.11.6"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pytest = "^7.4.1" pytest = "^7.4.1"
jupyter = "^1.0.0" jupyter = "^1.0.0"
black = "^24.2.0"
pre-commit = "^3.6.2"
[build-system] [build-system]
requires = ["poetry-core"] requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 88
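
Note that 88 columns is Black's default line length, so the [tool.black] table above and the hook's --line-length=88 argument pin the existing default explicitly rather than change it. As a quick sanity check, Black can also be driven as a library (black.format_str and black.Mode are its library entry points, though the project does not promise a stable Python API); the line below is taken from this diff and should come back unchanged:

import black

# A line already reformatted in this commit; Black should leave it as-is.
SOURCE = 'filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]\n'

formatted = black.format_str(SOURCE, mode=black.Mode(line_length=88))
assert formatted == SOURCE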


@ -1,3 +1,5 @@
from __future__ import annotations
import pandas as pd import pandas as pd
from typing import Tuple from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@ -70,6 +72,7 @@ def scrape_jobs(
for site in site_name for site in site_name
] ]
return site_types return site_types
country_enum = Country.from_string(country_indeed) country_enum = Country.from_string(country_indeed)
scraper_input = ScraperInput( scraper_input = ScraperInput(
@ -86,14 +89,15 @@ def scrape_jobs(
results_wanted=results_wanted, results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids, linkedin_company_ids=linkedin_company_ids,
offset=offset, offset=offset,
hours_old=hours_old hours_old=hours_old,
) )
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy) scraper = scraper_class(proxy=proxy)
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize() cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
logger.info(f"{site_name} finished scraping") logger.info(f"{site_name} finished scraping")
return site.value, scraped_data return site.value, scraped_data
@ -117,9 +121,8 @@ def scrape_jobs(
for site, job_response in site_to_jobs_dict.items(): for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs: for job in job_response.jobs:
job_data = job.dict() job_data = job.dict()
job_data[ job_url = job_data["job_url"]
"job_url_hyper" job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_data["site"] = site job_data["site"] = site
job_data["company"] = job_data["company_name"] job_data["company"] = job_data["company_name"]
job_data["job_type"] = ( job_data["job_type"] = (
@ -156,7 +159,7 @@ def scrape_jobs(
if jobs_dfs: if jobs_dfs:
# Step 1: Filter out all-NA columns from each DataFrame before concatenation # Step 1: Filter out all-NA columns from each DataFrame before concatenation
filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs] filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
# Step 2: Concatenate the filtered DataFrames # Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True) jobs_df = pd.concat(filtered_dfs, ignore_index=True)
@ -178,7 +181,6 @@ def scrape_jobs(
"is_remote", "is_remote",
"emails", "emails",
"description", "description",
"company_url", "company_url",
"company_url_direct", "company_url_direct",
"company_addresses", "company_addresses",
@ -201,6 +203,6 @@ def scrape_jobs(
jobs_df = jobs_df[desired_order] jobs_df = jobs_df[desired_order]
# Step 4: Sort the DataFrame as required # Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False]) return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
else: else:
return pd.DataFrame() return pd.DataFrame()


@ -1,3 +1,5 @@
from __future__ import annotations
from typing import Optional from typing import Optional
from datetime import date from datetime import date
from enum import Enum from enum import Enum
@ -156,7 +158,7 @@ class Country(Enum):
"""Convert a string to the corresponding Country enum.""" """Convert a string to the corresponding Country enum."""
country_str = country_str.strip().lower() country_str = country_str.strip().lower()
for country in cls: for country in cls:
country_names = country.value[0].split(',') country_names = country.value[0].split(",")
if country_str in country_names: if country_str in country_names:
return country return country
valid_countries = [country.value for country in cls] valid_countries = [country.value for country in cls]
@ -178,7 +180,10 @@ class Location(BaseModel):
location_parts.append(self.state) location_parts.append(self.state)
if isinstance(self.country, str): if isinstance(self.country, str):
location_parts.append(self.country) location_parts.append(self.country)
elif self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE): elif self.country and self.country not in (
Country.US_CANADA,
Country.WORLDWIDE,
):
country_name = self.country.value[0] country_name = self.country.value[0]
if "," in country_name: if "," in country_name:
country_name = country_name.split(",")[0] country_name = country_name.split(",")[0]


@ -1,10 +1,12 @@
from __future__ import annotations
from ..jobs import ( from ..jobs import (
Enum, Enum,
BaseModel, BaseModel,
JobType, JobType,
JobResponse, JobResponse,
Country, Country,
DescriptionFormat DescriptionFormat,
) )


@ -4,21 +4,23 @@ jobspy.scrapers.glassdoor
This module contains routines to scrape Glassdoor. This module contains routines to scrape Glassdoor.
""" """
import json
import re
from __future__ import annotations
import re
import json
import requests import requests
from typing import Optional from typing import Optional, Tuple
from datetime import datetime, timedelta from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from ..utils import extract_emails_from_text
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text
from ..exceptions import GlassdoorException from ..exceptions import GlassdoorException
from ..utils import ( from ..utils import (
create_session, create_session,
markdown_converter, markdown_converter,
logger logger,
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -27,7 +29,7 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
DescriptionFormat DescriptionFormat,
) )
@ -59,25 +61,22 @@ class GlassdoorScraper(Scraper):
self.session = create_session(self.proxy, is_tls=True, has_retry=True) self.session = create_session(self.proxy, is_tls=True, has_retry=True)
token = self._get_csrf_token() token = self._get_csrf_token()
self.headers['gd-csrf-token'] = token if token else self.fallback_token self.headers["gd-csrf-token"] = token if token else self.fallback_token
location_id, location_type = self._get_location( location_id, location_type = self._get_location(
scraper_input.location, scraper_input.is_remote scraper_input.location, scraper_input.is_remote
) )
if location_type is None: if location_type is None:
logger.error('Glassdoor: location not parsed') logger.error("Glassdoor: location not parsed")
return JobResponse(jobs=[]) return JobResponse(jobs=[])
all_jobs: list[JobPost] = [] all_jobs: list[JobPost] = []
cursor = None cursor = None
for page in range( range_start = 1 + (scraper_input.offset // self.jobs_per_page)
1 + (scraper_input.offset // self.jobs_per_page), tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
min( range_end = min(tot_pages, self.max_pages + 1)
(scraper_input.results_wanted // self.jobs_per_page) + 2, for page in range(range_start, range_end):
self.max_pages + 1, logger.info(f"Glassdoor search page: {page}")
),
):
logger.info(f'Glassdoor search page: {page}')
try: try:
jobs, cursor = self._fetch_jobs_page( jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor scraper_input, location_id, location_type, page, cursor
@ -87,7 +86,7 @@ class GlassdoorScraper(Scraper):
all_jobs = all_jobs[: scraper_input.results_wanted] all_jobs = all_jobs[: scraper_input.results_wanted]
break break
except Exception as e: except Exception as e:
logger.error(f'Glassdoor: {str(e)}') logger.error(f"Glassdoor: {str(e)}")
break break
return JobResponse(jobs=all_jobs) return JobResponse(jobs=all_jobs)
@ -98,39 +97,48 @@ class GlassdoorScraper(Scraper):
location_type: str, location_type: str,
page_num: int, page_num: int,
cursor: str | None, cursor: str | None,
) -> (list[JobPost], str | None): ) -> Tuple[list[JobPost], str | None]:
""" """
Scrapes a page of Glassdoor for jobs with scraper_input criteria Scrapes a page of Glassdoor for jobs with scraper_input criteria
""" """
jobs = [] jobs = []
self.scraper_input = scraper_input self.scraper_input = scraper_input
try: try:
payload = self._add_payload( payload = self._add_payload(location_id, location_type, page_num, cursor)
location_id, location_type, page_num, cursor
)
response = self.session.post( response = self.session.post(
f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload f"{self.base_url}/graph",
headers=self.headers,
timeout_seconds=15,
data=payload,
) )
if response.status_code != 200: if response.status_code != 200:
raise GlassdoorException(f"bad response status code: {response.status_code}") exc_msg = f"bad response status code: {response.status_code}"
raise GlassdoorException(exc_msg)
res_json = response.json()[0] res_json = response.json()[0]
if "errors" in res_json: if "errors" in res_json:
raise ValueError("Error encountered in API response") raise ValueError("Error encountered in API response")
except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e: except (
logger.error(f'Glassdoor: {str(e)}') requests.exceptions.ReadTimeout,
GlassdoorException,
ValueError,
Exception,
) as e:
logger.error(f"Glassdoor: {str(e)}")
return jobs, None return jobs, None
jobs_data = res_json["data"]["jobListings"]["jobListings"] jobs_data = res_json["data"]["jobListings"]["jobListings"]
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data} future_to_job_data = {
executor.submit(self._process_job, job): job for job in jobs_data
}
for future in as_completed(future_to_job_data): for future in as_completed(future_to_job_data):
try: try:
job_post = future.result() job_post = future.result()
if job_post: if job_post:
jobs.append(job_post) jobs.append(job_post)
except Exception as exc: except Exception as exc:
raise GlassdoorException(f'Glassdoor generated an exception: {exc}') raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
return jobs, self.get_cursor_for_page( return jobs, self.get_cursor_for_page(
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
@ -140,7 +148,9 @@ class GlassdoorScraper(Scraper):
""" """
Fetches csrf token needed for API by visiting a generic page Fetches csrf token needed for API by visiting a generic page
""" """
res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers) res = self.session.get(
f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
)
pattern = r'"token":\s*"([^"]+)"' pattern = r'"token":\s*"([^"]+)"'
matches = re.findall(pattern, res.text) matches = re.findall(pattern, res.text)
token = None token = None
@ -153,19 +163,20 @@ class GlassdoorScraper(Scraper):
Processes a single job and fetches its description. Processes a single job and fetches its description.
""" """
job_id = job_data["jobview"]["job"]["listingId"] job_id = job_data["jobview"]["job"]["listingId"]
job_url = f'{self.base_url}job-listing/j?jl={job_id}' job_url = f"{self.base_url}job-listing/j?jl={job_id}"
if job_url in self.seen_urls: if job_url in self.seen_urls:
return None return None
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
job = job_data["jobview"] job = job_data["jobview"]
title = job["job"]["jobTitleText"] title = job["job"]["jobTitleText"]
company_name = job["header"]["employerNameFromSearch"] company_name = job["header"]["employerNameFromSearch"]
company_id = job_data['jobview']['header']['employer']['id'] company_id = job_data["jobview"]["header"]["employer"]["id"]
location_name = job["header"].get("locationName", "") location_name = job["header"].get("locationName", "")
location_type = job["header"].get("locationType", "") location_type = job["header"].get("locationType", "")
age_in_days = job["header"].get("ageInDays") age_in_days = job["header"].get("ageInDays")
is_remote, location = False, None is_remote, location = False, None
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days is not None else None date_diff = (datetime.now() - timedelta(days=age_in_days)).date()
date_posted = date_diff if age_in_days is not None else None
if location_type == "S": if location_type == "S":
is_remote = True is_remote = True
@ -177,9 +188,10 @@ class GlassdoorScraper(Scraper):
description = self._fetch_job_description(job_id) description = self._fetch_job_description(job_id)
except: except:
description = None description = None
company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
return JobPost( return JobPost(
title=title, title=title,
company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None, company_url=company_url if company_id else None,
company_name=company_name, company_name=company_name,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
@ -201,7 +213,7 @@ class GlassdoorScraper(Scraper):
"variables": { "variables": {
"jl": job_id, "jl": job_id,
"queryString": "q", "queryString": "q",
"pageTypeEnum": "SERP" "pageTypeEnum": "SERP",
}, },
"query": """ "query": """
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) { query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
@ -216,15 +228,17 @@ class GlassdoorScraper(Scraper):
__typename __typename
} }
} }
""" """,
} }
] ]
res = requests.post(url, json=body, headers=self.headers) res = requests.post(url, json=body, headers=self.headers)
if res.status_code != 200: if res.status_code != 200:
return None return None
data = res.json()[0] data = res.json()[0]
desc = data['data']['jobview']['job']['description'] desc = data["data"]["jobview"]["job"]["description"]
return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
desc = markdown_converter(desc)
return desc
def _get_location(self, location: str, is_remote: bool) -> (int, str): def _get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote: if not location or is_remote:
@ -234,10 +248,13 @@ class GlassdoorScraper(Scraper):
res = self.session.get(url, headers=self.headers) res = self.session.get(url, headers=self.headers)
if res.status_code != 200: if res.status_code != 200:
if res.status_code == 429: if res.status_code == 429:
logger.error(f'429 Response - Blocked by Glassdoor for too many requests') err = f"429 Response - Blocked by Glassdoor for too many requests"
logger.error(err)
return None, None return None, None
else: else:
logger.error(f'Glassdoor response status code {res.status_code}') err = f"Glassdoor response status code {res.status_code}"
err += f" - {res.text}"
logger.error(f"Glassdoor response status code {res.status_code}")
return None, None return None, None
items = res.json() items = res.json()
@ -248,7 +265,7 @@ class GlassdoorScraper(Scraper):
location_type = "CITY" location_type = "CITY"
elif location_type == "S": elif location_type == "S":
location_type = "STATE" location_type = "STATE"
elif location_type == 'N': elif location_type == "N":
location_type = "COUNTRY" location_type = "COUNTRY"
return int(items[0]["locationId"]), location_type return int(items[0]["locationId"]), location_type
@ -259,7 +276,9 @@ class GlassdoorScraper(Scraper):
page_num: int, page_num: int,
cursor: str | None = None, cursor: str | None = None,
) -> str: ) -> str:
fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None fromage = None
if self.scraper_input.hours_old:
fromage = max(self.scraper_input.hours_old // 24, 1)
filter_params = [] filter_params = []
if self.scraper_input.easy_apply: if self.scraper_input.easy_apply:
filter_params.append({"filterKey": "applicationType", "values": "1"}) filter_params.append({"filterKey": "applicationType", "values": "1"})
@ -278,9 +297,9 @@ class GlassdoorScraper(Scraper):
"pageNumber": page_num, "pageNumber": page_num,
"pageCursor": cursor, "pageCursor": cursor,
"fromage": fromage, "fromage": fromage,
"sort": "date" "sort": "date",
}, },
"query": self.query_template "query": self.query_template,
} }
if self.scraper_input.job_type: if self.scraper_input.job_type:
payload["variables"]["filterParams"].append( payload["variables"]["filterParams"].append(


@ -4,9 +4,13 @@ jobspy.scrapers.indeed
This module contains routines to scrape Indeed. This module contains routines to scrape Indeed.
""" """
from __future__ import annotations
import math import math
from concurrent.futures import ThreadPoolExecutor, Future from typing import Tuple
from datetime import datetime from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, Future
import requests import requests
@ -15,7 +19,7 @@ from ..utils import (
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type, get_enum_from_job_type,
markdown_converter, markdown_converter,
logger logger,
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -24,7 +28,7 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
DescriptionFormat DescriptionFormat,
) )
@ -54,30 +58,30 @@ class IndeedScraper(Scraper):
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com" self.base_url = f"https://{domain}.indeed.com"
self.headers = self.api_headers.copy() self.headers = self.api_headers.copy()
self.headers['indeed-co'] = self.scraper_input.country.indeed_domain_value self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
job_list = [] job_list = []
page = 1 page = 1
cursor = None cursor = None
offset_pages = math.ceil(self.scraper_input.offset / 100) offset_pages = math.ceil(self.scraper_input.offset / 100)
for _ in range(offset_pages): for _ in range(offset_pages):
logger.info(f'Indeed skipping search page: {page}') logger.info(f"Indeed skipping search page: {page}")
__, cursor = self._scrape_page(cursor) __, cursor = self._scrape_page(cursor)
if not __: if not __:
logger.info(f'Indeed found no jobs on page: {page}') logger.info(f"Indeed found no jobs on page: {page}")
break break
while len(self.seen_urls) < scraper_input.results_wanted: while len(self.seen_urls) < scraper_input.results_wanted:
logger.info(f'Indeed search page: {page}') logger.info(f"Indeed search page: {page}")
jobs, cursor = self._scrape_page(cursor) jobs, cursor = self._scrape_page(cursor)
if not jobs: if not jobs:
logger.info(f'Indeed found no jobs on page: {page}') logger.info(f"Indeed found no jobs on page: {page}")
break break
job_list += jobs job_list += jobs
page += 1 page += 1
return JobResponse(jobs=job_list[:scraper_input.results_wanted]) return JobResponse(jobs=job_list[: scraper_input.results_wanted])
def _scrape_page(self, cursor: str | None) -> (list[JobPost], str | None): def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
""" """
Scrapes a page of Indeed for jobs with scraper_input criteria Scrapes a page of Indeed for jobs with scraper_input criteria
:param cursor: :param cursor:
@ -86,31 +90,43 @@ class IndeedScraper(Scraper):
jobs = [] jobs = []
new_cursor = None new_cursor = None
filters = self._build_filters() filters = self._build_filters()
location = (
self.scraper_input.location
or self.scraper_input.country.value[0].split(",")[-1]
)
query = self.job_search_query.format( query = self.job_search_query.format(
what=self.scraper_input.search_term, what=self.scraper_input.search_term,
location=self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1], location=location,
radius=self.scraper_input.distance, radius=self.scraper_input.distance,
dateOnIndeed=self.scraper_input.hours_old, dateOnIndeed=self.scraper_input.hours_old,
cursor=f'cursor: "{cursor}"' if cursor else '', cursor=f'cursor: "{cursor}"' if cursor else "",
filters=filters filters=filters,
) )
payload = { payload = {
'query': query, "query": query,
} }
api_headers = self.api_headers.copy() api_headers = self.api_headers.copy()
api_headers['indeed-co'] = self.api_country_code api_headers["indeed-co"] = self.api_country_code
response = requests.post(self.api_url, headers=api_headers, json=payload, proxies=self.proxy, timeout=10) response = requests.post(
self.api_url,
headers=api_headers,
json=payload,
proxies=self.proxy,
timeout=10,
)
if response.status_code != 200: if response.status_code != 200:
logger.info(f'Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)') logger.info(
f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)"
)
return jobs, new_cursor return jobs, new_cursor
data = response.json() data = response.json()
jobs = data['data']['jobSearch']['results'] jobs = data["data"]["jobSearch"]["results"]
new_cursor = data['data']['jobSearch']['pageInfo']['nextCursor'] new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
with ThreadPoolExecutor(max_workers=self.num_workers) as executor: with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
job_results: list[Future] = [ job_results: list[Future] = [
executor.submit(self._process_job, job['job']) for job in jobs executor.submit(self._process_job, job["job"]) for job in jobs
] ]
job_list = [result.result() for result in job_results if result.result()] job_list = [result.result() for result in job_results if result.result()]
return job_list, new_cursor return job_list, new_cursor
@ -128,7 +144,9 @@ class IndeedScraper(Scraper):
start: "{start}h" start: "{start}h"
}} }}
}} }}
""".format(start=self.scraper_input.hours_old) """.format(
start=self.scraper_input.hours_old
)
elif self.scraper_input.job_type or self.scraper_input.is_remote: elif self.scraper_input.job_type or self.scraper_input.is_remote:
job_type_key_mapping = { job_type_key_mapping = {
JobType.FULL_TIME: "CF3CP", JobType.FULL_TIME: "CF3CP",
@ -171,22 +189,24 @@ class IndeedScraper(Scraper):
if job_url in self.seen_urls: if job_url in self.seen_urls:
return return
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
description = job['description']['html'] description = job["description"]["html"]
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
job_type = self._get_job_type(job['attributes']) job_type = self._get_job_type(job["attributes"])
timestamp_seconds = job["datePublished"] / 1000 timestamp_seconds = job["datePublished"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d") date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
employer = job['employer'].get('dossier') if job['employer'] else None employer = job["employer"].get("dossier") if job["employer"] else None
employer_details = employer.get('employerDetails', {}) if employer else {} employer_details = employer.get("employerDetails", {}) if employer else {}
rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
return JobPost( return JobPost(
title=job["title"], title=job["title"],
description=description, description=description,
company_name=job['employer'].get("name") if job.get('employer') else None, company_name=job["employer"].get("name") if job.get("employer") else None,
company_url=f"{self.base_url}{job['employer']['relativeCompanyPageUrl']}" if job[ company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
'employer'] else None, company_url_direct=(
company_url_direct=employer['links']['corporateWebsite'] if employer else None, employer["links"]["corporateWebsite"] if employer else None
),
location=Location( location=Location(
city=job.get("location", {}).get("city"), city=job.get("location", {}).get("city"),
state=job.get("location", {}).get("admin1Code"), state=job.get("location", {}).get("admin1Code"),
@ -196,20 +216,39 @@ class IndeedScraper(Scraper):
compensation=self._get_compensation(job), compensation=self._get_compensation(job),
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
job_url_direct=job['recruit'].get('viewJobUrl') if job.get('recruit') else None, job_url_direct=(
job["recruit"].get("viewJobUrl") if job.get("recruit") else None
),
emails=extract_emails_from_text(description) if description else None, emails=extract_emails_from_text(description) if description else None,
is_remote=self._is_job_remote(job, description), is_remote=self._is_job_remote(job, description),
company_addresses=(
company_addresses=employer_details['addresses'][0] if employer_details.get('addresses') else None, employer_details["addresses"][0]
company_industry=employer_details['industry'].replace('Iv1', '').replace('_', ' ').title() if employer_details.get('industry') else None, if employer_details.get("addresses")
company_num_employees=employer_details.get('employeesLocalizedLabel'), else None
company_revenue=employer_details.get('revenueLocalizedLabel'), ),
company_description=employer_details.get('briefDescription'), company_industry=(
ceo_name=employer_details.get('ceoName'), employer_details["industry"]
ceo_photo_url=employer_details.get('ceoPhotoUrl'), .replace("Iv1", "")
.replace("_", " ")
logo_photo_url=employer['images'].get('squareLogoUrl') if employer and employer.get('images') else None, .title()
banner_photo_url=employer['images'].get('headerImageUrl') if employer and employer.get('images') else None, if employer_details.get("industry")
else None
),
company_num_employees=employer_details.get("employeesLocalizedLabel"),
company_revenue=employer_details.get("revenueLocalizedLabel"),
company_description=employer_details.get("briefDescription"),
ceo_name=employer_details.get("ceoName"),
ceo_photo_url=employer_details.get("ceoPhotoUrl"),
logo_photo_url=(
employer["images"].get("squareLogoUrl")
if employer and employer.get("images")
else None
),
banner_photo_url=(
employer["images"].get("headerImageUrl")
if employer and employer.get("images")
else None
),
) )
@staticmethod @staticmethod
@ -221,7 +260,7 @@ class IndeedScraper(Scraper):
""" """
job_types: list[JobType] = [] job_types: list[JobType] = []
for attribute in attributes: for attribute in attributes:
job_type_str = attribute['label'].replace("-", "").replace(" ", "").lower() job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
job_type = get_enum_from_job_type(job_type_str) job_type = get_enum_from_job_type(job_type_str)
if job_type: if job_type:
job_types.append(job_type) job_types.append(job_type)
@ -235,33 +274,41 @@ class IndeedScraper(Scraper):
:param job: :param job:
:return: compensation object :return: compensation object
""" """
comp = job['compensation']['baseSalary'] comp = job["compensation"]["baseSalary"]
if comp: if not comp:
interval = IndeedScraper._get_compensation_interval(comp['unitOfWork']) return None
if interval: interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
return Compensation( if not interval:
interval=interval, return None
min_amount=round(comp['range'].get('min'), 2) if comp['range'].get('min') is not None else None, min_range = comp["range"].get("min")
max_amount=round(comp['range'].get('max'), 2) if comp['range'].get('max') is not None else None, max_range = comp["range"].get("max")
currency=job['compensation']['currencyCode'] return Compensation(
) interval=interval,
min_amount=round(min_range, 2) if min_range is not None else None,
max_amount=round(max_range, 2) if max_range is not None else None,
currency=job["compensation"]["currencyCode"],
)
@staticmethod @staticmethod
def _is_job_remote(job: dict, description: str) -> bool: def _is_job_remote(job: dict, description: str) -> bool:
""" """
Searches the description, location, and attributes to check if job is remote Searches the description, location, and attributes to check if job is remote
""" """
remote_keywords = ['remote', 'work from home', 'wfh'] remote_keywords = ["remote", "work from home", "wfh"]
is_remote_in_attributes = any( is_remote_in_attributes = any(
any(keyword in attr['label'].lower() for keyword in remote_keywords) any(keyword in attr["label"].lower() for keyword in remote_keywords)
for attr in job['attributes'] for attr in job["attributes"]
)
is_remote_in_description = any(
keyword in description.lower() for keyword in remote_keywords
) )
is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords)
is_remote_in_location = any( is_remote_in_location = any(
keyword in job['location']['formatted']['long'].lower() keyword in job["location"]["formatted"]["long"].lower()
for keyword in remote_keywords for keyword in remote_keywords
) )
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location return (
is_remote_in_attributes or is_remote_in_description or is_remote_in_location
)
@staticmethod @staticmethod
def _get_compensation_interval(interval: str) -> CompensationInterval: def _get_compensation_interval(interval: str) -> CompensationInterval:
@ -270,7 +317,7 @@ class IndeedScraper(Scraper):
"YEAR": "YEARLY", "YEAR": "YEARLY",
"HOUR": "HOURLY", "HOUR": "HOURLY",
"WEEK": "WEEKLY", "WEEK": "WEEKLY",
"MONTH": "MONTHLY" "MONTH": "MONTHLY",
} }
mapped_interval = interval_mapping.get(interval.upper(), None) mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__: if mapped_interval and mapped_interval in CompensationInterval.__members__:
@ -279,14 +326,14 @@ class IndeedScraper(Scraper):
raise ValueError(f"Unsupported interval: {interval}") raise ValueError(f"Unsupported interval: {interval}")
api_headers = { api_headers = {
'Host': 'apis.indeed.com', "Host": "apis.indeed.com",
'content-type': 'application/json', "content-type": "application/json",
'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8', "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
'accept': 'application/json', "accept": "application/json",
'indeed-locale': 'en-US', "indeed-locale": "en-US",
'accept-language': 'en-US,en;q=0.9', "accept-language": "en-US,en;q=0.9",
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1', "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone', "indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
} }
job_search_query = """ job_search_query = """
query GetJobData {{ query GetJobData {{


@ -4,6 +4,9 @@ jobspy.scrapers.linkedin
This module contains routines to scrape LinkedIn. This module contains routines to scrape LinkedIn.
""" """
from __future__ import annotations
import time import time
import random import random
from typing import Optional from typing import Optional
@ -24,14 +27,14 @@ from ...jobs import (
JobType, JobType,
Country, Country,
Compensation, Compensation,
DescriptionFormat DescriptionFormat,
) )
from ..utils import ( from ..utils import (
logger, logger,
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type, get_enum_from_job_type,
currency_parser, currency_parser,
markdown_converter markdown_converter,
) )
@ -61,26 +64,32 @@ class LinkedInScraper(Scraper):
url_lock = Lock() url_lock = Lock()
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0 page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
seconds_old = ( seconds_old = (
scraper_input.hours_old * 3600 scraper_input.hours_old * 3600 if scraper_input.hours_old else None
if scraper_input.hours_old )
else None continue_search = (
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
) )
continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
while continue_search(): while continue_search():
logger.info(f'LinkedIn search page: {page // 25 + 1}') logger.info(f"LinkedIn search page: {page // 25 + 1}")
session = create_session(is_tls=False, has_retry=True, delay=5) session = create_session(is_tls=False, has_retry=True, delay=5)
params = { params = {
"keywords": scraper_input.search_term, "keywords": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
"distance": scraper_input.distance, "distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None, "f_WT": 2 if scraper_input.is_remote else None,
"f_JT": self.job_type_code(scraper_input.job_type) "f_JT": (
if scraper_input.job_type self.job_type_code(scraper_input.job_type)
else None, if scraper_input.job_type
else None
),
"pageNum": 0, "pageNum": 0,
"start": page + scraper_input.offset, "start": page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None, "f_AL": "true" if scraper_input.easy_apply else None,
"f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None, "f_C": (
",".join(map(str, scraper_input.linkedin_company_ids))
if scraper_input.linkedin_company_ids
else None
),
} }
if seconds_old is not None: if seconds_old is not None:
params["f_TPR"] = f"r{seconds_old}" params["f_TPR"] = f"r{seconds_old}"
@ -97,15 +106,19 @@ class LinkedInScraper(Scraper):
) )
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
if response.status_code == 429: if response.status_code == 429:
logger.error(f'429 Response - Blocked by LinkedIn for too many requests') err = (
f"429 Response - Blocked by LinkedIn for too many requests"
)
else: else:
logger.error(f'LinkedIn response status code {response.status_code}') err = f"LinkedIn response status code {response.status_code}"
err += f" - {response.text}"
logger.error(err)
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
except Exception as e: except Exception as e:
if "Proxy responded with" in str(e): if "Proxy responded with" in str(e):
logger.error(f'LinkedIn: Bad proxy') logger.error(f"LinkedIn: Bad proxy")
else: else:
logger.error(f'LinkedIn: {str(e)}') logger.error(f"LinkedIn: {str(e)}")
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
@ -126,11 +139,12 @@ class LinkedInScraper(Scraper):
continue continue
seen_urls.add(job_url) seen_urls.add(job_url)
try: try:
job_post = self._process_job(job_card, job_url, scraper_input.linkedin_fetch_description) fetch_desc = scraper_input.linkedin_fetch_description
job_post = self._process_job(job_card, job_url, fetch_desc)
if job_post: if job_post:
job_list.append(job_post) job_list.append(job_post)
if not continue_search(): if not continue_search():
break break
except Exception as e: except Exception as e:
raise LinkedInException(str(e)) raise LinkedInException(str(e))
@ -141,8 +155,10 @@ class LinkedInScraper(Scraper):
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def _process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]: def _process_job(
salary_tag = job_card.find('span', class_='job-search-card__salary-info') self, job_card: Tag, job_url: str, full_descr: bool
) -> Optional[JobPost]:
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
compensation = None compensation = None
if salary_tag: if salary_tag:
@ -212,7 +228,9 @@ class LinkedInScraper(Scraper):
""" """
try: try:
session = create_session(is_tls=False, has_retry=True) session = create_session(is_tls=False, has_retry=True)
response = session.get(job_page_url, headers=self.headers, timeout=5, proxies=self.proxy) response = session.get(
job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
)
response.raise_for_status() response.raise_for_status()
except: except:
return None, None return None, None
@ -225,10 +243,12 @@ class LinkedInScraper(Scraper):
) )
description = None description = None
if div_content is not None: if div_content is not None:
def remove_attributes(tag): def remove_attributes(tag):
for attr in list(tag.attrs): for attr in list(tag.attrs):
del tag[attr] del tag[attr]
return tag return tag
div_content = remove_attributes(div_content) div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html") description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
@ -257,11 +277,8 @@ class LinkedInScraper(Scraper):
) )
elif len(parts) == 3: elif len(parts) == 3:
city, state, country = parts city, state, country = parts
location = Location( country = Country.from_string(country)
city=city, location = Location(city=city, state=state, country=country)
state=state,
country=Country.from_string(country)
)
return location return location
@staticmethod @staticmethod


@ -1,9 +1,10 @@
import logging from __future__ import annotations
import re
import numpy as np import re
import logging
import requests import requests
import tls_client import tls_client
import numpy as np
from markdownify import markdownify as md from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry from requests.adapters import HTTPAdapter, Retry
@ -14,7 +15,8 @@ logger.propagate = False
if not logger.handlers: if not logger.handlers:
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
formatter = logging.Formatter(format)
console_handler.setFormatter(formatter) console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
@ -33,7 +35,12 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text) return email_regex.findall(text)
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session: def create_session(
proxy: dict | None = None,
is_tls: bool = True,
has_retry: bool = False,
delay: int = 1,
) -> requests.Session:
""" """
Creates a requests session with optional tls, proxy, and retry settings. Creates a requests session with optional tls, proxy, and retry settings.
:return: A session object :return: A session object
@ -47,15 +54,17 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
if proxy: if proxy:
session.proxies.update(proxy) session.proxies.update(proxy)
if has_retry: if has_retry:
retries = Retry(total=3, retries = Retry(
connect=3, total=3,
status=3, connect=3,
status_forcelist=[500, 502, 503, 504, 429], status=3,
backoff_factor=delay) status_forcelist=[500, 502, 503, 504, 429],
backoff_factor=delay,
)
adapter = HTTPAdapter(max_retries=retries) adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter) session.mount("http://", adapter)
session.mount('https://', adapter) session.mount("https://", adapter)
return session return session
@ -73,17 +82,15 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
 def currency_parser(cur_str):
     # Remove any non-numerical characters
     # except for ',' '.' or '-' (e.g. EUR)
-    cur_str = re.sub("[^-0-9.,]", '', cur_str)
+    cur_str = re.sub("[^-0-9.,]", "", cur_str)
     # Remove any 000s separators (either , or .)
-    cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
-    if '.' in list(cur_str[-3:]):
+    cur_str = re.sub("[.,]", "", cur_str[:-3]) + cur_str[-3:]
+    if "." in list(cur_str[-3:]):
         num = float(cur_str)
-    elif ',' in list(cur_str[-3:]):
-        num = float(cur_str.replace(',', '.'))
+    elif "," in list(cur_str[-3:]):
+        num = float(cur_str.replace(",", "."))
     else:
         num = float(cur_str)
     return np.round(num, 2)
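
(Aside, not from the diff: currency_parser behaves identically before and after the quote normalization; with made-up sample inputs it normalizes both European and US separators before rounding:)

# Illustrative calls against the currency_parser defined above (sample values only).
currency_parser("€1.234,56")  # -> 1234.56  (thousands dot stripped, decimal comma converted)
currency_parser("$1,234.56")  # -> 1234.56  (thousands comma stripped, decimal dot kept)
currency_parser("1000")       # -> 1000.0   (no separators at all)
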


@ -4,6 +4,9 @@ jobspy.scrapers.ziprecruiter
 This module contains routines to scrape ZipRecruiter.
 """

+from __future__ import annotations
+
 import math
 import time
 from datetime import datetime
@ -16,7 +19,7 @@ from ..utils import (
     logger,
     extract_emails_from_text,
     create_session,
-    markdown_converter
+    markdown_converter,
 )
 from ...jobs import (
     JobPost,
@ -25,7 +28,7 @@ from ...jobs import (
     JobResponse,
     JobType,
     Country,
-    DescriptionFormat
+    DescriptionFormat,
 )
@ -62,7 +65,7 @@ class ZipRecruiterScraper(Scraper):
                 break
             if page > 1:
                 time.sleep(self.delay)
-            logger.info(f'ZipRecruiter search page: {page}')
+            logger.info(f"ZipRecruiter search page: {page}")
             jobs_on_page, continue_token = self._find_jobs_in_page(
                 scraper_input, continue_token
             )
@ -88,25 +91,24 @@ class ZipRecruiterScraper(Scraper):
         if continue_token:
             params["continue_from"] = continue_token
         try:
-            res= self.session.get(
-                f"{self.api_url}/jobs-app/jobs",
-                headers=self.headers,
-                params=params
+            res = self.session.get(
+                f"{self.api_url}/jobs-app/jobs", headers=self.headers, params=params
             )
             if res.status_code not in range(200, 400):
                 if res.status_code == 429:
-                    logger.error(f'429 Response - Blocked by ZipRecruiter for too many requests')
+                    err = "429 Response - Blocked by ZipRecruiter for too many requests"
                 else:
-                    logger.error(f'ZipRecruiter response status code {res.status_code}')
+                    err = f"ZipRecruiter response status code {res.status_code}"
+                    err += f" with response: {res.text}"  # ZipRecruiter likely not available in EU
+                logger.error(err)
                 return jobs_list, ""
         except Exception as e:
             if "Proxy responded with" in str(e):
-                logger.error(f'Indeed: Bad proxy')
+                logger.error(f"Indeed: Bad proxy")
             else:
-                logger.error(f'Indeed: {str(e)}')
+                logger.error(f"Indeed: {str(e)}")
             return jobs_list, ""
         res_data = res.json()
         jobs_list = res_data.get("jobs", [])
         next_continue_token = res_data.get("continue", None)
@ -127,7 +129,11 @@ class ZipRecruiterScraper(Scraper):
         self.seen_urls.add(job_url)
         description = job.get("job_description", "").strip()
-        description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
+        description = (
+            markdown_converter(description)
+            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
+            else description
+        )
         company = job.get("hiring_company", {}).get("name")
         country_value = "usa" if job.get("job_country") == "US" else "canada"
         country_enum = Country.from_string(country_value)
@ -138,23 +144,22 @@ class ZipRecruiterScraper(Scraper):
         job_type = self._get_job_type_enum(
             job.get("employment_type", "").replace("_", "").lower()
         )
-        date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date()
+        date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
+        comp_interval = job.get("compensation_interval")
+        comp_interval = "yearly" if comp_interval == "annual" else comp_interval
+        comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
+        comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
+        comp_currency = job.get("compensation_currency")
         return JobPost(
             title=title,
             company_name=company,
             location=location,
             job_type=job_type,
             compensation=Compensation(
-                interval="yearly"
-                if job.get("compensation_interval") == "annual"
-                else job.get("compensation_interval"),
-                min_amount=int(job["compensation_min"])
-                if "compensation_min" in job
-                else None,
-                max_amount=int(job["compensation_max"])
-                if "compensation_max" in job
-                else None,
-                currency=job.get("compensation_currency"),
+                interval=comp_interval,
+                min_amount=comp_min,
+                max_amount=comp_max,
+                currency=comp_currency,
             ),
             date_posted=date_posted,
             job_url=job_url,
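
(Aside, not part of the PR: the rewrite above hoists the nested conditional expressions into named locals before constructing Compensation, which is what lets Black keep each line short. The same pattern in isolation, with a made-up job payload:)

# Hypothetical sample of the fields the scraper reads from ZipRecruiter's JSON.
job = {
    "compensation_interval": "annual",
    "compensation_min": 90000,
    "compensation_currency": "USD",
}
comp_interval = job.get("compensation_interval")
comp_interval = "yearly" if comp_interval == "annual" else comp_interval
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
comp_currency = job.get("compensation_currency")
# comp_interval == "yearly", comp_min == 90000, comp_max is None, comp_currency == "USD"
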
@ -163,8 +168,9 @@ class ZipRecruiterScraper(Scraper):
         )

     def _get_cookies(self):
-        data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
+        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
-        self.session.post(f"{self.api_url}/jobs-app/event", data=data, headers=self.headers)
+        url = f"{self.api_url}/jobs-app/event"
+        self.session.post(url, data=data, headers=self.headers)

     @staticmethod
     def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
@ -180,16 +186,13 @@ class ZipRecruiterScraper(Scraper):
             "location": scraper_input.location,
         }
         if scraper_input.hours_old:
-            fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
-            params['days'] = fromage
-        job_type_map = {
-            JobType.FULL_TIME: 'full_time',
-            JobType.PART_TIME: 'part_time'
-        }
+            params["days"] = max(scraper_input.hours_old // 24, 1)
+        job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
         if scraper_input.job_type:
-            params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0]
+            job_type = scraper_input.job_type
+            params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
         if scraper_input.easy_apply:
-            params['zipapply'] = 1
+            params["zipapply"] = 1
         if scraper_input.is_remote:
             params["remote"] = 1
         if scraper_input.distance: