mirror of https://github.com/Bunsly/JobSpy
format: Apply Black formatter to the codebase (#127)
parent
e8b4b376b8
commit
94d8f555fd
|
@ -0,0 +1,7 @@
|
||||||
|
repos:
|
||||||
|
- repo: https://github.com/psf/black
|
||||||
|
rev: 24.2.0
|
||||||
|
hooks:
|
||||||
|
- id: black
|
||||||
|
language_version: python
|
||||||
|
args: [--line-length=88, --quiet]
|
|
@ -203,6 +203,52 @@ soupsieve = ">1.2"
|
||||||
html5lib = ["html5lib"]
|
html5lib = ["html5lib"]
|
||||||
lxml = ["lxml"]
|
lxml = ["lxml"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "black"
|
||||||
|
version = "24.2.0"
|
||||||
|
description = "The uncompromising code formatter."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "black-24.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6981eae48b3b33399c8757036c7f5d48a535b962a7c2310d19361edeef64ce29"},
|
||||||
|
{file = "black-24.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d533d5e3259720fdbc1b37444491b024003e012c5173f7d06825a77508085430"},
|
||||||
|
{file = "black-24.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61a0391772490ddfb8a693c067df1ef5227257e72b0e4108482b8d41b5aee13f"},
|
||||||
|
{file = "black-24.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:992e451b04667116680cb88f63449267c13e1ad134f30087dec8527242e9862a"},
|
||||||
|
{file = "black-24.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:163baf4ef40e6897a2a9b83890e59141cc8c2a98f2dda5080dc15c00ee1e62cd"},
|
||||||
|
{file = "black-24.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e37c99f89929af50ffaf912454b3e3b47fd64109659026b678c091a4cd450fb2"},
|
||||||
|
{file = "black-24.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9de21bafcba9683853f6c96c2d515e364aee631b178eaa5145fc1c61a3cc92"},
|
||||||
|
{file = "black-24.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:9db528bccb9e8e20c08e716b3b09c6bdd64da0dd129b11e160bf082d4642ac23"},
|
||||||
|
{file = "black-24.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d84f29eb3ee44859052073b7636533ec995bd0f64e2fb43aeceefc70090e752b"},
|
||||||
|
{file = "black-24.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e08fb9a15c914b81dd734ddd7fb10513016e5ce7e6704bdd5e1251ceee51ac9"},
|
||||||
|
{file = "black-24.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:810d445ae6069ce64030c78ff6127cd9cd178a9ac3361435708b907d8a04c693"},
|
||||||
|
{file = "black-24.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ba15742a13de85e9b8f3239c8f807723991fbfae24bad92d34a2b12e81904982"},
|
||||||
|
{file = "black-24.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7e53a8c630f71db01b28cd9602a1ada68c937cbf2c333e6ed041390d6968faf4"},
|
||||||
|
{file = "black-24.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:93601c2deb321b4bad8f95df408e3fb3943d85012dddb6121336b8e24a0d1218"},
|
||||||
|
{file = "black-24.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0057f800de6acc4407fe75bb147b0c2b5cbb7c3ed110d3e5999cd01184d53b0"},
|
||||||
|
{file = "black-24.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:faf2ee02e6612577ba0181f4347bcbcf591eb122f7841ae5ba233d12c39dcb4d"},
|
||||||
|
{file = "black-24.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:057c3dc602eaa6fdc451069bd027a1b2635028b575a6c3acfd63193ced20d9c8"},
|
||||||
|
{file = "black-24.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:08654d0797e65f2423f850fc8e16a0ce50925f9337fb4a4a176a7aa4026e63f8"},
|
||||||
|
{file = "black-24.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca610d29415ee1a30a3f30fab7a8f4144e9d34c89a235d81292a1edb2b55f540"},
|
||||||
|
{file = "black-24.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:4dd76e9468d5536abd40ffbc7a247f83b2324f0c050556d9c371c2b9a9a95e31"},
|
||||||
|
{file = "black-24.2.0-py3-none-any.whl", hash = "sha256:e8a6ae970537e67830776488bca52000eaa37fa63b9988e8c487458d9cd5ace6"},
|
||||||
|
{file = "black-24.2.0.tar.gz", hash = "sha256:bce4f25c27c3435e4dace4815bcb2008b87e167e3bf4ee47ccdc5ce906eb4894"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
click = ">=8.0.0"
|
||||||
|
mypy-extensions = ">=0.4.3"
|
||||||
|
packaging = ">=22.0"
|
||||||
|
pathspec = ">=0.9.0"
|
||||||
|
platformdirs = ">=2"
|
||||||
|
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
|
||||||
|
typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
colorama = ["colorama (>=0.4.3)"]
|
||||||
|
d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"]
|
||||||
|
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
|
||||||
|
uvloop = ["uvloop (>=0.15.2)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bleach"
|
name = "bleach"
|
||||||
version = "6.0.0"
|
version = "6.0.0"
|
||||||
|
@ -308,6 +354,17 @@ files = [
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
pycparser = "*"
|
pycparser = "*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfgv"
|
||||||
|
version = "3.4.0"
|
||||||
|
description = "Validate configuration and produce human readable error messages."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
|
||||||
|
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.2.0"
|
version = "3.2.0"
|
||||||
|
@ -392,6 +449,20 @@ files = [
|
||||||
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
|
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "click"
|
||||||
|
version = "8.1.7"
|
||||||
|
description = "Composable command line interface toolkit"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
|
||||||
|
{file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorama"
|
name = "colorama"
|
||||||
version = "0.4.6"
|
version = "0.4.6"
|
||||||
|
@ -471,6 +542,17 @@ files = [
|
||||||
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
|
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "distlib"
|
||||||
|
version = "0.3.8"
|
||||||
|
description = "Distribution utilities"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"},
|
||||||
|
{file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "exceptiongroup"
|
name = "exceptiongroup"
|
||||||
version = "1.1.3"
|
version = "1.1.3"
|
||||||
|
@ -513,6 +595,22 @@ files = [
|
||||||
[package.extras]
|
[package.extras]
|
||||||
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
|
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "filelock"
|
||||||
|
version = "3.13.1"
|
||||||
|
description = "A platform independent file lock."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"},
|
||||||
|
{file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"]
|
||||||
|
testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
|
||||||
|
typing = ["typing-extensions (>=4.8)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fqdn"
|
name = "fqdn"
|
||||||
version = "1.5.1"
|
version = "1.5.1"
|
||||||
|
@ -524,6 +622,20 @@ files = [
|
||||||
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
|
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "identify"
|
||||||
|
version = "2.5.35"
|
||||||
|
description = "File identification library for Python"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "identify-2.5.35-py2.py3-none-any.whl", hash = "sha256:c4de0081837b211594f8e877a6b4fad7ca32bbfc1a9307fdd61c28bfe923f13e"},
|
||||||
|
{file = "identify-2.5.35.tar.gz", hash = "sha256:10a7ca245cfcd756a554a7288159f72ff105ad233c7c4b9c6f0f4d108f5f6791"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
license = ["ukkonen"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "idna"
|
name = "idna"
|
||||||
version = "3.4"
|
version = "3.4"
|
||||||
|
@ -1125,6 +1237,17 @@ files = [
|
||||||
{file = "mistune-3.0.1.tar.gz", hash = "sha256:e912116c13aa0944f9dc530db38eb88f6a77087ab128f49f84a48f4c05ea163c"},
|
{file = "mistune-3.0.1.tar.gz", hash = "sha256:e912116c13aa0944f9dc530db38eb88f6a77087ab128f49f84a48f4c05ea163c"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mypy-extensions"
|
||||||
|
version = "1.0.0"
|
||||||
|
description = "Type system extensions for programs checked with the mypy type checker."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.5"
|
||||||
|
files = [
|
||||||
|
{file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
|
||||||
|
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nbclient"
|
name = "nbclient"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
|
@ -1216,6 +1339,20 @@ files = [
|
||||||
{file = "nest_asyncio-1.5.7.tar.gz", hash = "sha256:6a80f7b98f24d9083ed24608977c09dd608d83f91cccc24c9d2cba6d10e01c10"},
|
{file = "nest_asyncio-1.5.7.tar.gz", hash = "sha256:6a80f7b98f24d9083ed24608977c09dd608d83f91cccc24c9d2cba6d10e01c10"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nodeenv"
|
||||||
|
version = "1.8.0"
|
||||||
|
description = "Node.js virtual environment builder"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*"
|
||||||
|
files = [
|
||||||
|
{file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"},
|
||||||
|
{file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
setuptools = "*"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "notebook"
|
name = "notebook"
|
||||||
version = "7.0.3"
|
version = "7.0.3"
|
||||||
|
@ -1402,6 +1539,17 @@ files = [
|
||||||
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
|
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
|
||||||
testing = ["docopt", "pytest (<6.0.0)"]
|
testing = ["docopt", "pytest (<6.0.0)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pathspec"
|
||||||
|
version = "0.12.1"
|
||||||
|
description = "Utility library for gitignore style pattern matching of file paths."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"},
|
||||||
|
{file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pexpect"
|
name = "pexpect"
|
||||||
version = "4.8.0"
|
version = "4.8.0"
|
||||||
|
@ -1457,6 +1605,24 @@ files = [
|
||||||
dev = ["pre-commit", "tox"]
|
dev = ["pre-commit", "tox"]
|
||||||
testing = ["pytest", "pytest-benchmark"]
|
testing = ["pytest", "pytest-benchmark"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pre-commit"
|
||||||
|
version = "3.6.2"
|
||||||
|
description = "A framework for managing and maintaining multi-language pre-commit hooks."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.9"
|
||||||
|
files = [
|
||||||
|
{file = "pre_commit-3.6.2-py2.py3-none-any.whl", hash = "sha256:ba637c2d7a670c10daedc059f5c49b5bd0aadbccfcd7ec15592cf9665117532c"},
|
||||||
|
{file = "pre_commit-3.6.2.tar.gz", hash = "sha256:c3ef34f463045c88658c5b99f38c1e297abdcc0ff13f98d3370055fbbfabc67e"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
cfgv = ">=2.0.0"
|
||||||
|
identify = ">=1.0.0"
|
||||||
|
nodeenv = ">=0.11.1"
|
||||||
|
pyyaml = ">=5.1"
|
||||||
|
virtualenv = ">=20.10.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "prometheus-client"
|
name = "prometheus-client"
|
||||||
version = "0.17.1"
|
version = "0.17.1"
|
||||||
|
@ -2183,6 +2349,22 @@ nativelib = ["pyobjc-framework-Cocoa", "pywin32"]
|
||||||
objc = ["pyobjc-framework-Cocoa"]
|
objc = ["pyobjc-framework-Cocoa"]
|
||||||
win32 = ["pywin32"]
|
win32 = ["pywin32"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "setuptools"
|
||||||
|
version = "69.1.1"
|
||||||
|
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"},
|
||||||
|
{file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
|
||||||
|
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||||
|
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "six"
|
name = "six"
|
||||||
version = "1.16.0"
|
version = "1.16.0"
|
||||||
|
@ -2383,6 +2565,26 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.
|
||||||
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
|
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||||
zstd = ["zstandard (>=0.18.0)"]
|
zstd = ["zstandard (>=0.18.0)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "virtualenv"
|
||||||
|
version = "20.25.1"
|
||||||
|
description = "Virtual Python Environment builder"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "virtualenv-20.25.1-py3-none-any.whl", hash = "sha256:961c026ac520bac5f69acb8ea063e8a4f071bcc9457b9c1f28f6b085c511583a"},
|
||||||
|
{file = "virtualenv-20.25.1.tar.gz", hash = "sha256:e08e13ecdca7a0bd53798f356d5831434afa5b07b93f0abdf0797b7a06ffe197"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
distlib = ">=0.3.7,<1"
|
||||||
|
filelock = ">=3.12.2,<4"
|
||||||
|
platformdirs = ">=3.9.1,<5"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
|
||||||
|
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wcwidth"
|
name = "wcwidth"
|
||||||
version = "0.2.6"
|
version = "0.2.6"
|
||||||
|
@ -2450,4 +2652,4 @@ files = [
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "ba7f7cc9b6833a4a6271981f90610395639dd8b9b3db1370cbd1149d70cc9632"
|
content-hash = "6ee18819a726314f61f20f0ed93b2db2a26c232269f045146d9a8f4e3f31eb01"
|
||||||
|
|
|
@ -24,7 +24,12 @@ markdownify = "^0.11.6"
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
pytest = "^7.4.1"
|
pytest = "^7.4.1"
|
||||||
jupyter = "^1.0.0"
|
jupyter = "^1.0.0"
|
||||||
|
black = "^24.2.0"
|
||||||
|
pre-commit = "^3.6.2"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
|
[tool.black]
|
||||||
|
line-length = 88
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
@ -70,6 +72,7 @@ def scrape_jobs(
|
||||||
for site in site_name
|
for site in site_name
|
||||||
]
|
]
|
||||||
return site_types
|
return site_types
|
||||||
|
|
||||||
country_enum = Country.from_string(country_indeed)
|
country_enum = Country.from_string(country_indeed)
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
|
@ -86,14 +89,15 @@ def scrape_jobs(
|
||||||
results_wanted=results_wanted,
|
results_wanted=results_wanted,
|
||||||
linkedin_company_ids=linkedin_company_ids,
|
linkedin_company_ids=linkedin_company_ids,
|
||||||
offset=offset,
|
offset=offset,
|
||||||
hours_old=hours_old
|
hours_old=hours_old,
|
||||||
)
|
)
|
||||||
|
|
||||||
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
||||||
scraper_class = SCRAPER_MAPPING[site]
|
scraper_class = SCRAPER_MAPPING[site]
|
||||||
scraper = scraper_class(proxy=proxy)
|
scraper = scraper_class(proxy=proxy)
|
||||||
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
||||||
site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()
|
cap_name = site.value.capitalize()
|
||||||
|
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
|
||||||
logger.info(f"{site_name} finished scraping")
|
logger.info(f"{site_name} finished scraping")
|
||||||
return site.value, scraped_data
|
return site.value, scraped_data
|
||||||
|
|
||||||
|
@ -117,9 +121,8 @@ def scrape_jobs(
|
||||||
for site, job_response in site_to_jobs_dict.items():
|
for site, job_response in site_to_jobs_dict.items():
|
||||||
for job in job_response.jobs:
|
for job in job_response.jobs:
|
||||||
job_data = job.dict()
|
job_data = job.dict()
|
||||||
job_data[
|
job_url = job_data["job_url"]
|
||||||
"job_url_hyper"
|
job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
|
||||||
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
|
|
||||||
job_data["site"] = site
|
job_data["site"] = site
|
||||||
job_data["company"] = job_data["company_name"]
|
job_data["company"] = job_data["company_name"]
|
||||||
job_data["job_type"] = (
|
job_data["job_type"] = (
|
||||||
|
@ -156,11 +159,11 @@ def scrape_jobs(
|
||||||
|
|
||||||
if jobs_dfs:
|
if jobs_dfs:
|
||||||
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
|
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
|
||||||
filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
|
filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
|
||||||
|
|
||||||
# Step 2: Concatenate the filtered DataFrames
|
# Step 2: Concatenate the filtered DataFrames
|
||||||
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
|
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
|
||||||
|
|
||||||
# Desired column order
|
# Desired column order
|
||||||
desired_order = [
|
desired_order = [
|
||||||
"site",
|
"site",
|
||||||
|
@ -178,7 +181,6 @@ def scrape_jobs(
|
||||||
"is_remote",
|
"is_remote",
|
||||||
"emails",
|
"emails",
|
||||||
"description",
|
"description",
|
||||||
|
|
||||||
"company_url",
|
"company_url",
|
||||||
"company_url_direct",
|
"company_url_direct",
|
||||||
"company_addresses",
|
"company_addresses",
|
||||||
|
@ -191,16 +193,16 @@ def scrape_jobs(
|
||||||
"ceo_name",
|
"ceo_name",
|
||||||
"ceo_photo_url",
|
"ceo_photo_url",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Step 3: Ensure all desired columns are present, adding missing ones as empty
|
# Step 3: Ensure all desired columns are present, adding missing ones as empty
|
||||||
for column in desired_order:
|
for column in desired_order:
|
||||||
if column not in jobs_df.columns:
|
if column not in jobs_df.columns:
|
||||||
jobs_df[column] = None # Add missing columns as empty
|
jobs_df[column] = None # Add missing columns as empty
|
||||||
|
|
||||||
# Reorder the DataFrame according to the desired order
|
# Reorder the DataFrame according to the desired order
|
||||||
jobs_df = jobs_df[desired_order]
|
jobs_df = jobs_df[desired_order]
|
||||||
|
|
||||||
# Step 4: Sort the DataFrame as required
|
# Step 4: Sort the DataFrame as required
|
||||||
return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
|
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
|
||||||
else:
|
else:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from datetime import date
|
from datetime import date
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
@ -156,7 +158,7 @@ class Country(Enum):
|
||||||
"""Convert a string to the corresponding Country enum."""
|
"""Convert a string to the corresponding Country enum."""
|
||||||
country_str = country_str.strip().lower()
|
country_str = country_str.strip().lower()
|
||||||
for country in cls:
|
for country in cls:
|
||||||
country_names = country.value[0].split(',')
|
country_names = country.value[0].split(",")
|
||||||
if country_str in country_names:
|
if country_str in country_names:
|
||||||
return country
|
return country
|
||||||
valid_countries = [country.value for country in cls]
|
valid_countries = [country.value for country in cls]
|
||||||
|
@ -178,7 +180,10 @@ class Location(BaseModel):
|
||||||
location_parts.append(self.state)
|
location_parts.append(self.state)
|
||||||
if isinstance(self.country, str):
|
if isinstance(self.country, str):
|
||||||
location_parts.append(self.country)
|
location_parts.append(self.country)
|
||||||
elif self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
|
elif self.country and self.country not in (
|
||||||
|
Country.US_CANADA,
|
||||||
|
Country.WORLDWIDE,
|
||||||
|
):
|
||||||
country_name = self.country.value[0]
|
country_name = self.country.value[0]
|
||||||
if "," in country_name:
|
if "," in country_name:
|
||||||
country_name = country_name.split(",")[0]
|
country_name = country_name.split(",")[0]
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
from ..jobs import (
|
from ..jobs import (
|
||||||
Enum,
|
Enum,
|
||||||
BaseModel,
|
BaseModel,
|
||||||
JobType,
|
JobType,
|
||||||
JobResponse,
|
JobResponse,
|
||||||
Country,
|
Country,
|
||||||
DescriptionFormat
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,21 +4,23 @@ jobspy.scrapers.glassdoor
|
||||||
|
|
||||||
This module contains routines to scrape Glassdoor.
|
This module contains routines to scrape Glassdoor.
|
||||||
"""
|
"""
|
||||||
import json
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
import requests
|
import requests
|
||||||
from typing import Optional
|
from typing import Optional, Tuple
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from ..utils import extract_emails_from_text
|
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
|
from ..utils import extract_emails_from_text
|
||||||
from ..exceptions import GlassdoorException
|
from ..exceptions import GlassdoorException
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
create_session,
|
create_session,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
logger
|
logger,
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
|
@ -27,7 +29,7 @@ from ...jobs import (
|
||||||
Location,
|
Location,
|
||||||
JobResponse,
|
JobResponse,
|
||||||
JobType,
|
JobType,
|
||||||
DescriptionFormat
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -59,25 +61,22 @@ class GlassdoorScraper(Scraper):
|
||||||
|
|
||||||
self.session = create_session(self.proxy, is_tls=True, has_retry=True)
|
self.session = create_session(self.proxy, is_tls=True, has_retry=True)
|
||||||
token = self._get_csrf_token()
|
token = self._get_csrf_token()
|
||||||
self.headers['gd-csrf-token'] = token if token else self.fallback_token
|
self.headers["gd-csrf-token"] = token if token else self.fallback_token
|
||||||
|
|
||||||
location_id, location_type = self._get_location(
|
location_id, location_type = self._get_location(
|
||||||
scraper_input.location, scraper_input.is_remote
|
scraper_input.location, scraper_input.is_remote
|
||||||
)
|
)
|
||||||
if location_type is None:
|
if location_type is None:
|
||||||
logger.error('Glassdoor: location not parsed')
|
logger.error("Glassdoor: location not parsed")
|
||||||
return JobResponse(jobs=[])
|
return JobResponse(jobs=[])
|
||||||
all_jobs: list[JobPost] = []
|
all_jobs: list[JobPost] = []
|
||||||
cursor = None
|
cursor = None
|
||||||
|
|
||||||
for page in range(
|
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
|
||||||
1 + (scraper_input.offset // self.jobs_per_page),
|
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
|
||||||
min(
|
range_end = min(tot_pages, self.max_pages + 1)
|
||||||
(scraper_input.results_wanted // self.jobs_per_page) + 2,
|
for page in range(range_start, range_end):
|
||||||
self.max_pages + 1,
|
logger.info(f"Glassdoor search page: {page}")
|
||||||
),
|
|
||||||
):
|
|
||||||
logger.info(f'Glassdoor search page: {page}')
|
|
||||||
try:
|
try:
|
||||||
jobs, cursor = self._fetch_jobs_page(
|
jobs, cursor = self._fetch_jobs_page(
|
||||||
scraper_input, location_id, location_type, page, cursor
|
scraper_input, location_id, location_type, page, cursor
|
||||||
|
@ -87,7 +86,7 @@ class GlassdoorScraper(Scraper):
|
||||||
all_jobs = all_jobs[: scraper_input.results_wanted]
|
all_jobs = all_jobs[: scraper_input.results_wanted]
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f'Glassdoor: {str(e)}')
|
logger.error(f"Glassdoor: {str(e)}")
|
||||||
break
|
break
|
||||||
return JobResponse(jobs=all_jobs)
|
return JobResponse(jobs=all_jobs)
|
||||||
|
|
||||||
|
@ -98,39 +97,48 @@ class GlassdoorScraper(Scraper):
|
||||||
location_type: str,
|
location_type: str,
|
||||||
page_num: int,
|
page_num: int,
|
||||||
cursor: str | None,
|
cursor: str | None,
|
||||||
) -> (list[JobPost], str | None):
|
) -> Tuple[list[JobPost], str | None]:
|
||||||
"""
|
"""
|
||||||
Scrapes a page of Glassdoor for jobs with scraper_input criteria
|
Scrapes a page of Glassdoor for jobs with scraper_input criteria
|
||||||
"""
|
"""
|
||||||
jobs = []
|
jobs = []
|
||||||
self.scraper_input = scraper_input
|
self.scraper_input = scraper_input
|
||||||
try:
|
try:
|
||||||
payload = self._add_payload(
|
payload = self._add_payload(location_id, location_type, page_num, cursor)
|
||||||
location_id, location_type, page_num, cursor
|
|
||||||
)
|
|
||||||
response = self.session.post(
|
response = self.session.post(
|
||||||
f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload
|
f"{self.base_url}/graph",
|
||||||
|
headers=self.headers,
|
||||||
|
timeout_seconds=15,
|
||||||
|
data=payload,
|
||||||
)
|
)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
raise GlassdoorException(f"bad response status code: {response.status_code}")
|
exc_msg = f"bad response status code: {response.status_code}"
|
||||||
|
raise GlassdoorException(exc_msg)
|
||||||
res_json = response.json()[0]
|
res_json = response.json()[0]
|
||||||
if "errors" in res_json:
|
if "errors" in res_json:
|
||||||
raise ValueError("Error encountered in API response")
|
raise ValueError("Error encountered in API response")
|
||||||
except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e:
|
except (
|
||||||
logger.error(f'Glassdoor: {str(e)}')
|
requests.exceptions.ReadTimeout,
|
||||||
|
GlassdoorException,
|
||||||
|
ValueError,
|
||||||
|
Exception,
|
||||||
|
) as e:
|
||||||
|
logger.error(f"Glassdoor: {str(e)}")
|
||||||
return jobs, None
|
return jobs, None
|
||||||
|
|
||||||
jobs_data = res_json["data"]["jobListings"]["jobListings"]
|
jobs_data = res_json["data"]["jobListings"]["jobListings"]
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
||||||
future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
|
future_to_job_data = {
|
||||||
|
executor.submit(self._process_job, job): job for job in jobs_data
|
||||||
|
}
|
||||||
for future in as_completed(future_to_job_data):
|
for future in as_completed(future_to_job_data):
|
||||||
try:
|
try:
|
||||||
job_post = future.result()
|
job_post = future.result()
|
||||||
if job_post:
|
if job_post:
|
||||||
jobs.append(job_post)
|
jobs.append(job_post)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise GlassdoorException(f'Glassdoor generated an exception: {exc}')
|
raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
|
||||||
|
|
||||||
return jobs, self.get_cursor_for_page(
|
return jobs, self.get_cursor_for_page(
|
||||||
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
||||||
|
@ -140,7 +148,9 @@ class GlassdoorScraper(Scraper):
|
||||||
"""
|
"""
|
||||||
Fetches csrf token needed for API by visiting a generic page
|
Fetches csrf token needed for API by visiting a generic page
|
||||||
"""
|
"""
|
||||||
res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers)
|
res = self.session.get(
|
||||||
|
f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
|
||||||
|
)
|
||||||
pattern = r'"token":\s*"([^"]+)"'
|
pattern = r'"token":\s*"([^"]+)"'
|
||||||
matches = re.findall(pattern, res.text)
|
matches = re.findall(pattern, res.text)
|
||||||
token = None
|
token = None
|
||||||
|
@ -153,19 +163,20 @@ class GlassdoorScraper(Scraper):
|
||||||
Processes a single job and fetches its description.
|
Processes a single job and fetches its description.
|
||||||
"""
|
"""
|
||||||
job_id = job_data["jobview"]["job"]["listingId"]
|
job_id = job_data["jobview"]["job"]["listingId"]
|
||||||
job_url = f'{self.base_url}job-listing/j?jl={job_id}'
|
job_url = f"{self.base_url}job-listing/j?jl={job_id}"
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
return None
|
return None
|
||||||
self.seen_urls.add(job_url)
|
self.seen_urls.add(job_url)
|
||||||
job = job_data["jobview"]
|
job = job_data["jobview"]
|
||||||
title = job["job"]["jobTitleText"]
|
title = job["job"]["jobTitleText"]
|
||||||
company_name = job["header"]["employerNameFromSearch"]
|
company_name = job["header"]["employerNameFromSearch"]
|
||||||
company_id = job_data['jobview']['header']['employer']['id']
|
company_id = job_data["jobview"]["header"]["employer"]["id"]
|
||||||
location_name = job["header"].get("locationName", "")
|
location_name = job["header"].get("locationName", "")
|
||||||
location_type = job["header"].get("locationType", "")
|
location_type = job["header"].get("locationType", "")
|
||||||
age_in_days = job["header"].get("ageInDays")
|
age_in_days = job["header"].get("ageInDays")
|
||||||
is_remote, location = False, None
|
is_remote, location = False, None
|
||||||
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days is not None else None
|
date_diff = (datetime.now() - timedelta(days=age_in_days)).date()
|
||||||
|
date_posted = date_diff if age_in_days is not None else None
|
||||||
|
|
||||||
if location_type == "S":
|
if location_type == "S":
|
||||||
is_remote = True
|
is_remote = True
|
||||||
|
@ -177,9 +188,10 @@ class GlassdoorScraper(Scraper):
|
||||||
description = self._fetch_job_description(job_id)
|
description = self._fetch_job_description(job_id)
|
||||||
except:
|
except:
|
||||||
description = None
|
description = None
|
||||||
|
company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
|
||||||
return JobPost(
|
return JobPost(
|
||||||
title=title,
|
title=title,
|
||||||
company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
|
company_url=company_url if company_id else None,
|
||||||
company_name=company_name,
|
company_name=company_name,
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
|
@ -201,7 +213,7 @@ class GlassdoorScraper(Scraper):
|
||||||
"variables": {
|
"variables": {
|
||||||
"jl": job_id,
|
"jl": job_id,
|
||||||
"queryString": "q",
|
"queryString": "q",
|
||||||
"pageTypeEnum": "SERP"
|
"pageTypeEnum": "SERP",
|
||||||
},
|
},
|
||||||
"query": """
|
"query": """
|
||||||
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
|
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
|
||||||
|
@ -216,15 +228,17 @@ class GlassdoorScraper(Scraper):
|
||||||
__typename
|
__typename
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
res = requests.post(url, json=body, headers=self.headers)
|
res = requests.post(url, json=body, headers=self.headers)
|
||||||
if res.status_code != 200:
|
if res.status_code != 200:
|
||||||
return None
|
return None
|
||||||
data = res.json()[0]
|
data = res.json()[0]
|
||||||
desc = data['data']['jobview']['job']['description']
|
desc = data["data"]["jobview"]["job"]["description"]
|
||||||
return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
|
desc = markdown_converter(desc)
|
||||||
|
return desc
|
||||||
|
|
||||||
def _get_location(self, location: str, is_remote: bool) -> (int, str):
|
def _get_location(self, location: str, is_remote: bool) -> (int, str):
|
||||||
if not location or is_remote:
|
if not location or is_remote:
|
||||||
|
@ -234,10 +248,13 @@ class GlassdoorScraper(Scraper):
|
||||||
res = self.session.get(url, headers=self.headers)
|
res = self.session.get(url, headers=self.headers)
|
||||||
if res.status_code != 200:
|
if res.status_code != 200:
|
||||||
if res.status_code == 429:
|
if res.status_code == 429:
|
||||||
logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
|
err = f"429 Response - Blocked by Glassdoor for too many requests"
|
||||||
|
logger.error(err)
|
||||||
return None, None
|
return None, None
|
||||||
else:
|
else:
|
||||||
logger.error(f'Glassdoor response status code {res.status_code}')
|
err = f"Glassdoor response status code {res.status_code}"
|
||||||
|
err += f" - {res.text}"
|
||||||
|
logger.error(f"Glassdoor response status code {res.status_code}")
|
||||||
return None, None
|
return None, None
|
||||||
items = res.json()
|
items = res.json()
|
||||||
|
|
||||||
|
@ -248,7 +265,7 @@ class GlassdoorScraper(Scraper):
|
||||||
location_type = "CITY"
|
location_type = "CITY"
|
||||||
elif location_type == "S":
|
elif location_type == "S":
|
||||||
location_type = "STATE"
|
location_type = "STATE"
|
||||||
elif location_type == 'N':
|
elif location_type == "N":
|
||||||
location_type = "COUNTRY"
|
location_type = "COUNTRY"
|
||||||
return int(items[0]["locationId"]), location_type
|
return int(items[0]["locationId"]), location_type
|
||||||
|
|
||||||
|
@ -259,7 +276,9 @@ class GlassdoorScraper(Scraper):
|
||||||
page_num: int,
|
page_num: int,
|
||||||
cursor: str | None = None,
|
cursor: str | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
|
fromage = None
|
||||||
|
if self.scraper_input.hours_old:
|
||||||
|
fromage = max(self.scraper_input.hours_old // 24, 1)
|
||||||
filter_params = []
|
filter_params = []
|
||||||
if self.scraper_input.easy_apply:
|
if self.scraper_input.easy_apply:
|
||||||
filter_params.append({"filterKey": "applicationType", "values": "1"})
|
filter_params.append({"filterKey": "applicationType", "values": "1"})
|
||||||
|
@ -278,9 +297,9 @@ class GlassdoorScraper(Scraper):
|
||||||
"pageNumber": page_num,
|
"pageNumber": page_num,
|
||||||
"pageCursor": cursor,
|
"pageCursor": cursor,
|
||||||
"fromage": fromage,
|
"fromage": fromage,
|
||||||
"sort": "date"
|
"sort": "date",
|
||||||
},
|
},
|
||||||
"query": self.query_template
|
"query": self.query_template,
|
||||||
}
|
}
|
||||||
if self.scraper_input.job_type:
|
if self.scraper_input.job_type:
|
||||||
payload["variables"]["filterParams"].append(
|
payload["variables"]["filterParams"].append(
|
||||||
|
@ -512,4 +531,4 @@ class GlassdoorScraper(Scraper):
|
||||||
}
|
}
|
||||||
__typename
|
__typename
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -4,9 +4,13 @@ jobspy.scrapers.indeed
|
||||||
|
|
||||||
This module contains routines to scrape Indeed.
|
This module contains routines to scrape Indeed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import math
|
import math
|
||||||
from concurrent.futures import ThreadPoolExecutor, Future
|
from typing import Tuple
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, Future
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
@ -15,7 +19,7 @@ from ..utils import (
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
get_enum_from_job_type,
|
get_enum_from_job_type,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
logger
|
logger,
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
|
@ -24,7 +28,7 @@ from ...jobs import (
|
||||||
Location,
|
Location,
|
||||||
JobResponse,
|
JobResponse,
|
||||||
JobType,
|
JobType,
|
||||||
DescriptionFormat
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -54,30 +58,30 @@ class IndeedScraper(Scraper):
|
||||||
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
|
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
|
||||||
self.base_url = f"https://{domain}.indeed.com"
|
self.base_url = f"https://{domain}.indeed.com"
|
||||||
self.headers = self.api_headers.copy()
|
self.headers = self.api_headers.copy()
|
||||||
self.headers['indeed-co'] = self.scraper_input.country.indeed_domain_value
|
self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
|
||||||
job_list = []
|
job_list = []
|
||||||
page = 1
|
page = 1
|
||||||
|
|
||||||
cursor = None
|
cursor = None
|
||||||
offset_pages = math.ceil(self.scraper_input.offset / 100)
|
offset_pages = math.ceil(self.scraper_input.offset / 100)
|
||||||
for _ in range(offset_pages):
|
for _ in range(offset_pages):
|
||||||
logger.info(f'Indeed skipping search page: {page}')
|
logger.info(f"Indeed skipping search page: {page}")
|
||||||
__, cursor = self._scrape_page(cursor)
|
__, cursor = self._scrape_page(cursor)
|
||||||
if not __:
|
if not __:
|
||||||
logger.info(f'Indeed found no jobs on page: {page}')
|
logger.info(f"Indeed found no jobs on page: {page}")
|
||||||
break
|
break
|
||||||
|
|
||||||
while len(self.seen_urls) < scraper_input.results_wanted:
|
while len(self.seen_urls) < scraper_input.results_wanted:
|
||||||
logger.info(f'Indeed search page: {page}')
|
logger.info(f"Indeed search page: {page}")
|
||||||
jobs, cursor = self._scrape_page(cursor)
|
jobs, cursor = self._scrape_page(cursor)
|
||||||
if not jobs:
|
if not jobs:
|
||||||
logger.info(f'Indeed found no jobs on page: {page}')
|
logger.info(f"Indeed found no jobs on page: {page}")
|
||||||
break
|
break
|
||||||
job_list += jobs
|
job_list += jobs
|
||||||
page += 1
|
page += 1
|
||||||
return JobResponse(jobs=job_list[:scraper_input.results_wanted])
|
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
|
||||||
|
|
||||||
def _scrape_page(self, cursor: str | None) -> (list[JobPost], str | None):
|
def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
|
||||||
"""
|
"""
|
||||||
Scrapes a page of Indeed for jobs with scraper_input criteria
|
Scrapes a page of Indeed for jobs with scraper_input criteria
|
||||||
:param cursor:
|
:param cursor:
|
||||||
|
@ -86,31 +90,43 @@ class IndeedScraper(Scraper):
|
||||||
jobs = []
|
jobs = []
|
||||||
new_cursor = None
|
new_cursor = None
|
||||||
filters = self._build_filters()
|
filters = self._build_filters()
|
||||||
|
location = (
|
||||||
|
self.scraper_input.location
|
||||||
|
or self.scraper_input.country.value[0].split(",")[-1]
|
||||||
|
)
|
||||||
query = self.job_search_query.format(
|
query = self.job_search_query.format(
|
||||||
what=self.scraper_input.search_term,
|
what=self.scraper_input.search_term,
|
||||||
location=self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1],
|
location=location,
|
||||||
radius=self.scraper_input.distance,
|
radius=self.scraper_input.distance,
|
||||||
dateOnIndeed=self.scraper_input.hours_old,
|
dateOnIndeed=self.scraper_input.hours_old,
|
||||||
cursor=f'cursor: "{cursor}"' if cursor else '',
|
cursor=f'cursor: "{cursor}"' if cursor else "",
|
||||||
filters=filters
|
filters=filters,
|
||||||
)
|
)
|
||||||
payload = {
|
payload = {
|
||||||
'query': query,
|
"query": query,
|
||||||
}
|
}
|
||||||
api_headers = self.api_headers.copy()
|
api_headers = self.api_headers.copy()
|
||||||
api_headers['indeed-co'] = self.api_country_code
|
api_headers["indeed-co"] = self.api_country_code
|
||||||
response = requests.post(self.api_url, headers=api_headers, json=payload, proxies=self.proxy, timeout=10)
|
response = requests.post(
|
||||||
|
self.api_url,
|
||||||
|
headers=api_headers,
|
||||||
|
json=payload,
|
||||||
|
proxies=self.proxy,
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
logger.info(f'Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)')
|
logger.info(
|
||||||
|
f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)"
|
||||||
|
)
|
||||||
return jobs, new_cursor
|
return jobs, new_cursor
|
||||||
data = response.json()
|
data = response.json()
|
||||||
jobs = data['data']['jobSearch']['results']
|
jobs = data["data"]["jobSearch"]["results"]
|
||||||
new_cursor = data['data']['jobSearch']['pageInfo']['nextCursor']
|
new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
|
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
|
||||||
job_results: list[Future] = [
|
job_results: list[Future] = [
|
||||||
executor.submit(self._process_job, job['job']) for job in jobs
|
executor.submit(self._process_job, job["job"]) for job in jobs
|
||||||
]
|
]
|
||||||
job_list = [result.result() for result in job_results if result.result()]
|
job_list = [result.result() for result in job_results if result.result()]
|
||||||
return job_list, new_cursor
|
return job_list, new_cursor
|
||||||
|
|
||||||
|
@ -128,7 +144,9 @@ class IndeedScraper(Scraper):
|
||||||
start: "{start}h"
|
start: "{start}h"
|
||||||
}}
|
}}
|
||||||
}}
|
}}
|
||||||
""".format(start=self.scraper_input.hours_old)
|
""".format(
|
||||||
|
start=self.scraper_input.hours_old
|
||||||
|
)
|
||||||
elif self.scraper_input.job_type or self.scraper_input.is_remote:
|
elif self.scraper_input.job_type or self.scraper_input.is_remote:
|
||||||
job_type_key_mapping = {
|
job_type_key_mapping = {
|
||||||
JobType.FULL_TIME: "CF3CP",
|
JobType.FULL_TIME: "CF3CP",
|
||||||
|
@ -171,22 +189,24 @@ class IndeedScraper(Scraper):
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
return
|
return
|
||||||
self.seen_urls.add(job_url)
|
self.seen_urls.add(job_url)
|
||||||
description = job['description']['html']
|
description = job["description"]["html"]
|
||||||
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
|
description = markdown_converter(description)
|
||||||
|
|
||||||
job_type = self._get_job_type(job['attributes'])
|
job_type = self._get_job_type(job["attributes"])
|
||||||
timestamp_seconds = job["datePublished"] / 1000
|
timestamp_seconds = job["datePublished"] / 1000
|
||||||
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
|
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
|
||||||
employer = job['employer'].get('dossier') if job['employer'] else None
|
employer = job["employer"].get("dossier") if job["employer"] else None
|
||||||
employer_details = employer.get('employerDetails', {}) if employer else {}
|
employer_details = employer.get("employerDetails", {}) if employer else {}
|
||||||
|
rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
|
||||||
return JobPost(
|
return JobPost(
|
||||||
title=job["title"],
|
title=job["title"],
|
||||||
description=description,
|
description=description,
|
||||||
company_name=job['employer'].get("name") if job.get('employer') else None,
|
company_name=job["employer"].get("name") if job.get("employer") else None,
|
||||||
company_url=f"{self.base_url}{job['employer']['relativeCompanyPageUrl']}" if job[
|
company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
|
||||||
'employer'] else None,
|
company_url_direct=(
|
||||||
company_url_direct=employer['links']['corporateWebsite'] if employer else None,
|
employer["links"]["corporateWebsite"] if employer else None
|
||||||
|
),
|
||||||
location=Location(
|
location=Location(
|
||||||
city=job.get("location", {}).get("city"),
|
city=job.get("location", {}).get("city"),
|
||||||
state=job.get("location", {}).get("admin1Code"),
|
state=job.get("location", {}).get("admin1Code"),
|
||||||
|
@ -196,20 +216,39 @@ class IndeedScraper(Scraper):
|
||||||
compensation=self._get_compensation(job),
|
compensation=self._get_compensation(job),
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
job_url_direct=job['recruit'].get('viewJobUrl') if job.get('recruit') else None,
|
job_url_direct=(
|
||||||
|
job["recruit"].get("viewJobUrl") if job.get("recruit") else None
|
||||||
|
),
|
||||||
emails=extract_emails_from_text(description) if description else None,
|
emails=extract_emails_from_text(description) if description else None,
|
||||||
is_remote=self._is_job_remote(job, description),
|
is_remote=self._is_job_remote(job, description),
|
||||||
|
company_addresses=(
|
||||||
company_addresses=employer_details['addresses'][0] if employer_details.get('addresses') else None,
|
employer_details["addresses"][0]
|
||||||
company_industry=employer_details['industry'].replace('Iv1', '').replace('_', ' ').title() if employer_details.get('industry') else None,
|
if employer_details.get("addresses")
|
||||||
company_num_employees=employer_details.get('employeesLocalizedLabel'),
|
else None
|
||||||
company_revenue=employer_details.get('revenueLocalizedLabel'),
|
),
|
||||||
company_description=employer_details.get('briefDescription'),
|
company_industry=(
|
||||||
ceo_name=employer_details.get('ceoName'),
|
employer_details["industry"]
|
||||||
ceo_photo_url=employer_details.get('ceoPhotoUrl'),
|
.replace("Iv1", "")
|
||||||
|
.replace("_", " ")
|
||||||
logo_photo_url=employer['images'].get('squareLogoUrl') if employer and employer.get('images') else None,
|
.title()
|
||||||
banner_photo_url=employer['images'].get('headerImageUrl') if employer and employer.get('images') else None,
|
if employer_details.get("industry")
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
company_num_employees=employer_details.get("employeesLocalizedLabel"),
|
||||||
|
company_revenue=employer_details.get("revenueLocalizedLabel"),
|
||||||
|
company_description=employer_details.get("briefDescription"),
|
||||||
|
ceo_name=employer_details.get("ceoName"),
|
||||||
|
ceo_photo_url=employer_details.get("ceoPhotoUrl"),
|
||||||
|
logo_photo_url=(
|
||||||
|
employer["images"].get("squareLogoUrl")
|
||||||
|
if employer and employer.get("images")
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
banner_photo_url=(
|
||||||
|
employer["images"].get("headerImageUrl")
|
||||||
|
if employer and employer.get("images")
|
||||||
|
else None
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -221,7 +260,7 @@ class IndeedScraper(Scraper):
|
||||||
"""
|
"""
|
||||||
job_types: list[JobType] = []
|
job_types: list[JobType] = []
|
||||||
for attribute in attributes:
|
for attribute in attributes:
|
||||||
job_type_str = attribute['label'].replace("-", "").replace(" ", "").lower()
|
job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
|
||||||
job_type = get_enum_from_job_type(job_type_str)
|
job_type = get_enum_from_job_type(job_type_str)
|
||||||
if job_type:
|
if job_type:
|
||||||
job_types.append(job_type)
|
job_types.append(job_type)
|
||||||
|
@ -235,33 +274,41 @@ class IndeedScraper(Scraper):
|
||||||
:param job:
|
:param job:
|
||||||
:return: compensation object
|
:return: compensation object
|
||||||
"""
|
"""
|
||||||
comp = job['compensation']['baseSalary']
|
comp = job["compensation"]["baseSalary"]
|
||||||
if comp:
|
if not comp:
|
||||||
interval = IndeedScraper._get_compensation_interval(comp['unitOfWork'])
|
return None
|
||||||
if interval:
|
interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
|
||||||
return Compensation(
|
if not interval:
|
||||||
interval=interval,
|
return None
|
||||||
min_amount=round(comp['range'].get('min'), 2) if comp['range'].get('min') is not None else None,
|
min_range = comp["range"].get("min")
|
||||||
max_amount=round(comp['range'].get('max'), 2) if comp['range'].get('max') is not None else None,
|
max_range = comp["range"].get("max")
|
||||||
currency=job['compensation']['currencyCode']
|
return Compensation(
|
||||||
)
|
interval=interval,
|
||||||
|
min_amount=round(min_range, 2) if min_range is not None else None,
|
||||||
|
max_amount=round(max_range, 2) if max_range is not None else None,
|
||||||
|
currency=job["compensation"]["currencyCode"],
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _is_job_remote(job: dict, description: str) -> bool:
|
def _is_job_remote(job: dict, description: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Searches the description, location, and attributes to check if job is remote
|
Searches the description, location, and attributes to check if job is remote
|
||||||
"""
|
"""
|
||||||
remote_keywords = ['remote', 'work from home', 'wfh']
|
remote_keywords = ["remote", "work from home", "wfh"]
|
||||||
is_remote_in_attributes = any(
|
is_remote_in_attributes = any(
|
||||||
any(keyword in attr['label'].lower() for keyword in remote_keywords)
|
any(keyword in attr["label"].lower() for keyword in remote_keywords)
|
||||||
for attr in job['attributes']
|
for attr in job["attributes"]
|
||||||
|
)
|
||||||
|
is_remote_in_description = any(
|
||||||
|
keyword in description.lower() for keyword in remote_keywords
|
||||||
)
|
)
|
||||||
is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords)
|
|
||||||
is_remote_in_location = any(
|
is_remote_in_location = any(
|
||||||
keyword in job['location']['formatted']['long'].lower()
|
keyword in job["location"]["formatted"]["long"].lower()
|
||||||
for keyword in remote_keywords
|
for keyword in remote_keywords
|
||||||
)
|
)
|
||||||
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
|
return (
|
||||||
|
is_remote_in_attributes or is_remote_in_description or is_remote_in_location
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_compensation_interval(interval: str) -> CompensationInterval:
|
def _get_compensation_interval(interval: str) -> CompensationInterval:
|
||||||
|
@ -270,7 +317,7 @@ class IndeedScraper(Scraper):
|
||||||
"YEAR": "YEARLY",
|
"YEAR": "YEARLY",
|
||||||
"HOUR": "HOURLY",
|
"HOUR": "HOURLY",
|
||||||
"WEEK": "WEEKLY",
|
"WEEK": "WEEKLY",
|
||||||
"MONTH": "MONTHLY"
|
"MONTH": "MONTHLY",
|
||||||
}
|
}
|
||||||
mapped_interval = interval_mapping.get(interval.upper(), None)
|
mapped_interval = interval_mapping.get(interval.upper(), None)
|
||||||
if mapped_interval and mapped_interval in CompensationInterval.__members__:
|
if mapped_interval and mapped_interval in CompensationInterval.__members__:
|
||||||
|
@ -279,14 +326,14 @@ class IndeedScraper(Scraper):
|
||||||
raise ValueError(f"Unsupported interval: {interval}")
|
raise ValueError(f"Unsupported interval: {interval}")
|
||||||
|
|
||||||
api_headers = {
|
api_headers = {
|
||||||
'Host': 'apis.indeed.com',
|
"Host": "apis.indeed.com",
|
||||||
'content-type': 'application/json',
|
"content-type": "application/json",
|
||||||
'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8',
|
"indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
|
||||||
'accept': 'application/json',
|
"accept": "application/json",
|
||||||
'indeed-locale': 'en-US',
|
"indeed-locale": "en-US",
|
||||||
'accept-language': 'en-US,en;q=0.9',
|
"accept-language": "en-US,en;q=0.9",
|
||||||
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1',
|
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
|
||||||
'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone',
|
"indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
|
||||||
}
|
}
|
||||||
job_search_query = """
|
job_search_query = """
|
||||||
query GetJobData {{
|
query GetJobData {{
|
||||||
|
|
|
@ -4,6 +4,9 @@ jobspy.scrapers.linkedin
|
||||||
|
|
||||||
This module contains routines to scrape LinkedIn.
|
This module contains routines to scrape LinkedIn.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
@ -24,14 +27,14 @@ from ...jobs import (
|
||||||
JobType,
|
JobType,
|
||||||
Country,
|
Country,
|
||||||
Compensation,
|
Compensation,
|
||||||
DescriptionFormat
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
logger,
|
logger,
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
get_enum_from_job_type,
|
get_enum_from_job_type,
|
||||||
currency_parser,
|
currency_parser,
|
||||||
markdown_converter
|
markdown_converter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -61,26 +64,32 @@ class LinkedInScraper(Scraper):
|
||||||
url_lock = Lock()
|
url_lock = Lock()
|
||||||
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
|
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
|
||||||
seconds_old = (
|
seconds_old = (
|
||||||
scraper_input.hours_old * 3600
|
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
|
||||||
if scraper_input.hours_old
|
)
|
||||||
else None
|
continue_search = (
|
||||||
|
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
|
||||||
)
|
)
|
||||||
continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
|
|
||||||
while continue_search():
|
while continue_search():
|
||||||
logger.info(f'LinkedIn search page: {page // 25 + 1}')
|
logger.info(f"LinkedIn search page: {page // 25 + 1}")
|
||||||
session = create_session(is_tls=False, has_retry=True, delay=5)
|
session = create_session(is_tls=False, has_retry=True, delay=5)
|
||||||
params = {
|
params = {
|
||||||
"keywords": scraper_input.search_term,
|
"keywords": scraper_input.search_term,
|
||||||
"location": scraper_input.location,
|
"location": scraper_input.location,
|
||||||
"distance": scraper_input.distance,
|
"distance": scraper_input.distance,
|
||||||
"f_WT": 2 if scraper_input.is_remote else None,
|
"f_WT": 2 if scraper_input.is_remote else None,
|
||||||
"f_JT": self.job_type_code(scraper_input.job_type)
|
"f_JT": (
|
||||||
if scraper_input.job_type
|
self.job_type_code(scraper_input.job_type)
|
||||||
else None,
|
if scraper_input.job_type
|
||||||
|
else None
|
||||||
|
),
|
||||||
"pageNum": 0,
|
"pageNum": 0,
|
||||||
"start": page + scraper_input.offset,
|
"start": page + scraper_input.offset,
|
||||||
"f_AL": "true" if scraper_input.easy_apply else None,
|
"f_AL": "true" if scraper_input.easy_apply else None,
|
||||||
"f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None,
|
"f_C": (
|
||||||
|
",".join(map(str, scraper_input.linkedin_company_ids))
|
||||||
|
if scraper_input.linkedin_company_ids
|
||||||
|
else None
|
||||||
|
),
|
||||||
}
|
}
|
||||||
if seconds_old is not None:
|
if seconds_old is not None:
|
||||||
params["f_TPR"] = f"r{seconds_old}"
|
params["f_TPR"] = f"r{seconds_old}"
|
||||||
|
@ -97,15 +106,19 @@ class LinkedInScraper(Scraper):
|
||||||
)
|
)
|
||||||
if response.status_code not in range(200, 400):
|
if response.status_code not in range(200, 400):
|
||||||
if response.status_code == 429:
|
if response.status_code == 429:
|
||||||
logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
|
err = (
|
||||||
|
f"429 Response - Blocked by LinkedIn for too many requests"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.error(f'LinkedIn response status code {response.status_code}')
|
err = f"LinkedIn response status code {response.status_code}"
|
||||||
|
err += f" - {response.text}"
|
||||||
|
logger.error(err)
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if "Proxy responded with" in str(e):
|
if "Proxy responded with" in str(e):
|
||||||
logger.error(f'LinkedIn: Bad proxy')
|
logger.error(f"LinkedIn: Bad proxy")
|
||||||
else:
|
else:
|
||||||
logger.error(f'LinkedIn: {str(e)}')
|
logger.error(f"LinkedIn: {str(e)}")
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
@ -126,11 +139,12 @@ class LinkedInScraper(Scraper):
|
||||||
continue
|
continue
|
||||||
seen_urls.add(job_url)
|
seen_urls.add(job_url)
|
||||||
try:
|
try:
|
||||||
job_post = self._process_job(job_card, job_url, scraper_input.linkedin_fetch_description)
|
fetch_desc = scraper_input.linkedin_fetch_description
|
||||||
|
job_post = self._process_job(job_card, job_url, fetch_desc)
|
||||||
if job_post:
|
if job_post:
|
||||||
job_list.append(job_post)
|
job_list.append(job_post)
|
||||||
if not continue_search():
|
if not continue_search():
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise LinkedInException(str(e))
|
raise LinkedInException(str(e))
|
||||||
|
|
||||||
|
@ -141,8 +155,10 @@ class LinkedInScraper(Scraper):
|
||||||
job_list = job_list[: scraper_input.results_wanted]
|
job_list = job_list[: scraper_input.results_wanted]
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
||||||
def _process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
|
def _process_job(
|
||||||
salary_tag = job_card.find('span', class_='job-search-card__salary-info')
|
self, job_card: Tag, job_url: str, full_descr: bool
|
||||||
|
) -> Optional[JobPost]:
|
||||||
|
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
|
||||||
|
|
||||||
compensation = None
|
compensation = None
|
||||||
if salary_tag:
|
if salary_tag:
|
||||||
|
@ -212,7 +228,9 @@ class LinkedInScraper(Scraper):
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
session = create_session(is_tls=False, has_retry=True)
|
session = create_session(is_tls=False, has_retry=True)
|
||||||
response = session.get(job_page_url, headers=self.headers, timeout=5, proxies=self.proxy)
|
response = session.get(
|
||||||
|
job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
|
||||||
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
except:
|
except:
|
||||||
return None, None
|
return None, None
|
||||||
|
@ -225,10 +243,12 @@ class LinkedInScraper(Scraper):
|
||||||
)
|
)
|
||||||
description = None
|
description = None
|
||||||
if div_content is not None:
|
if div_content is not None:
|
||||||
|
|
||||||
def remove_attributes(tag):
|
def remove_attributes(tag):
|
||||||
for attr in list(tag.attrs):
|
for attr in list(tag.attrs):
|
||||||
del tag[attr]
|
del tag[attr]
|
||||||
return tag
|
return tag
|
||||||
|
|
||||||
div_content = remove_attributes(div_content)
|
div_content = remove_attributes(div_content)
|
||||||
description = div_content.prettify(formatter="html")
|
description = div_content.prettify(formatter="html")
|
||||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
|
@ -257,11 +277,8 @@ class LinkedInScraper(Scraper):
|
||||||
)
|
)
|
||||||
elif len(parts) == 3:
|
elif len(parts) == 3:
|
||||||
city, state, country = parts
|
city, state, country = parts
|
||||||
location = Location(
|
country = Country.from_string(country)
|
||||||
city=city,
|
location = Location(city=city, state=state, country=country)
|
||||||
state=state,
|
|
||||||
country=Country.from_string(country)
|
|
||||||
)
|
|
||||||
return location
|
return location
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
import logging
|
from __future__ import annotations
|
||||||
import re
|
|
||||||
|
|
||||||
import numpy as np
|
import re
|
||||||
|
import logging
|
||||||
import requests
|
import requests
|
||||||
import tls_client
|
import tls_client
|
||||||
|
import numpy as np
|
||||||
from markdownify import markdownify as md
|
from markdownify import markdownify as md
|
||||||
from requests.adapters import HTTPAdapter, Retry
|
from requests.adapters import HTTPAdapter, Retry
|
||||||
|
|
||||||
|
@ -14,7 +15,8 @@ logger.propagate = False
|
||||||
if not logger.handlers:
|
if not logger.handlers:
|
||||||
logger.setLevel(logging.INFO)
|
logger.setLevel(logging.INFO)
|
||||||
console_handler = logging.StreamHandler()
|
console_handler = logging.StreamHandler()
|
||||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
|
formatter = logging.Formatter(format)
|
||||||
console_handler.setFormatter(formatter)
|
console_handler.setFormatter(formatter)
|
||||||
logger.addHandler(console_handler)
|
logger.addHandler(console_handler)
|
||||||
|
|
||||||
|
@ -33,7 +35,12 @@ def extract_emails_from_text(text: str) -> list[str] | None:
|
||||||
return email_regex.findall(text)
|
return email_regex.findall(text)
|
||||||
|
|
||||||
|
|
||||||
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
|
def create_session(
|
||||||
|
proxy: dict | None = None,
|
||||||
|
is_tls: bool = True,
|
||||||
|
has_retry: bool = False,
|
||||||
|
delay: int = 1,
|
||||||
|
) -> requests.Session:
|
||||||
"""
|
"""
|
||||||
Creates a requests session with optional tls, proxy, and retry settings.
|
Creates a requests session with optional tls, proxy, and retry settings.
|
||||||
:return: A session object
|
:return: A session object
|
||||||
|
@ -47,15 +54,17 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
|
||||||
if proxy:
|
if proxy:
|
||||||
session.proxies.update(proxy)
|
session.proxies.update(proxy)
|
||||||
if has_retry:
|
if has_retry:
|
||||||
retries = Retry(total=3,
|
retries = Retry(
|
||||||
connect=3,
|
total=3,
|
||||||
status=3,
|
connect=3,
|
||||||
status_forcelist=[500, 502, 503, 504, 429],
|
status=3,
|
||||||
backoff_factor=delay)
|
status_forcelist=[500, 502, 503, 504, 429],
|
||||||
|
backoff_factor=delay,
|
||||||
|
)
|
||||||
adapter = HTTPAdapter(max_retries=retries)
|
adapter = HTTPAdapter(max_retries=retries)
|
||||||
|
|
||||||
session.mount('http://', adapter)
|
session.mount("http://", adapter)
|
||||||
session.mount('https://', adapter)
|
session.mount("https://", adapter)
|
||||||
return session
|
return session
|
||||||
|
|
||||||
|
|
||||||
|
@ -73,17 +82,15 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
|
||||||
def currency_parser(cur_str):
|
def currency_parser(cur_str):
|
||||||
# Remove any non-numerical characters
|
# Remove any non-numerical characters
|
||||||
# except for ',' '.' or '-' (e.g. EUR)
|
# except for ',' '.' or '-' (e.g. EUR)
|
||||||
cur_str = re.sub("[^-0-9.,]", '', cur_str)
|
cur_str = re.sub("[^-0-9.,]", "", cur_str)
|
||||||
# Remove any 000s separators (either , or .)
|
# Remove any 000s separators (either , or .)
|
||||||
cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
|
cur_str = re.sub("[.,]", "", cur_str[:-3]) + cur_str[-3:]
|
||||||
|
|
||||||
if '.' in list(cur_str[-3:]):
|
if "." in list(cur_str[-3:]):
|
||||||
num = float(cur_str)
|
num = float(cur_str)
|
||||||
elif ',' in list(cur_str[-3:]):
|
elif "," in list(cur_str[-3:]):
|
||||||
num = float(cur_str.replace(',', '.'))
|
num = float(cur_str.replace(",", "."))
|
||||||
else:
|
else:
|
||||||
num = float(cur_str)
|
num = float(cur_str)
|
||||||
|
|
||||||
return np.round(num, 2)
|
return np.round(num, 2)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,9 @@ jobspy.scrapers.ziprecruiter
|
||||||
|
|
||||||
This module contains routines to scrape ZipRecruiter.
|
This module contains routines to scrape ZipRecruiter.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import math
|
import math
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
@ -16,7 +19,7 @@ from ..utils import (
|
||||||
logger,
|
logger,
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
create_session,
|
create_session,
|
||||||
markdown_converter
|
markdown_converter,
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
|
@ -25,7 +28,7 @@ from ...jobs import (
|
||||||
JobResponse,
|
JobResponse,
|
||||||
JobType,
|
JobType,
|
||||||
Country,
|
Country,
|
||||||
DescriptionFormat
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -62,7 +65,7 @@ class ZipRecruiterScraper(Scraper):
|
||||||
break
|
break
|
||||||
if page > 1:
|
if page > 1:
|
||||||
time.sleep(self.delay)
|
time.sleep(self.delay)
|
||||||
logger.info(f'ZipRecruiter search page: {page}')
|
logger.info(f"ZipRecruiter search page: {page}")
|
||||||
jobs_on_page, continue_token = self._find_jobs_in_page(
|
jobs_on_page, continue_token = self._find_jobs_in_page(
|
||||||
scraper_input, continue_token
|
scraper_input, continue_token
|
||||||
)
|
)
|
||||||
|
@ -88,25 +91,24 @@ class ZipRecruiterScraper(Scraper):
|
||||||
if continue_token:
|
if continue_token:
|
||||||
params["continue_from"] = continue_token
|
params["continue_from"] = continue_token
|
||||||
try:
|
try:
|
||||||
res= self.session.get(
|
res = self.session.get(
|
||||||
f"{self.api_url}/jobs-app/jobs",
|
f"{self.api_url}/jobs-app/jobs", headers=self.headers, params=params
|
||||||
headers=self.headers,
|
|
||||||
params=params
|
|
||||||
)
|
)
|
||||||
if res.status_code not in range(200, 400):
|
if res.status_code not in range(200, 400):
|
||||||
if res.status_code == 429:
|
if res.status_code == 429:
|
||||||
logger.error(f'429 Response - Blocked by ZipRecruiter for too many requests')
|
err = "429 Response - Blocked by ZipRecruiter for too many requests"
|
||||||
else:
|
else:
|
||||||
logger.error(f'ZipRecruiter response status code {res.status_code}')
|
err = f"ZipRecruiter response status code {res.status_code}"
|
||||||
|
err += f" with response: {res.text}" # ZipRecruiter likely not available in EU
|
||||||
|
logger.error(err)
|
||||||
return jobs_list, ""
|
return jobs_list, ""
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if "Proxy responded with" in str(e):
|
if "Proxy responded with" in str(e):
|
||||||
logger.error(f'Indeed: Bad proxy')
|
logger.error(f"Indeed: Bad proxy")
|
||||||
else:
|
else:
|
||||||
logger.error(f'Indeed: {str(e)}')
|
logger.error(f"Indeed: {str(e)}")
|
||||||
return jobs_list, ""
|
return jobs_list, ""
|
||||||
|
|
||||||
|
|
||||||
res_data = res.json()
|
res_data = res.json()
|
||||||
jobs_list = res_data.get("jobs", [])
|
jobs_list = res_data.get("jobs", [])
|
||||||
next_continue_token = res_data.get("continue", None)
|
next_continue_token = res_data.get("continue", None)
|
||||||
|
@ -127,7 +129,11 @@ class ZipRecruiterScraper(Scraper):
|
||||||
self.seen_urls.add(job_url)
|
self.seen_urls.add(job_url)
|
||||||
|
|
||||||
description = job.get("job_description", "").strip()
|
description = job.get("job_description", "").strip()
|
||||||
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
|
description = (
|
||||||
|
markdown_converter(description)
|
||||||
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
|
||||||
|
else description
|
||||||
|
)
|
||||||
company = job.get("hiring_company", {}).get("name")
|
company = job.get("hiring_company", {}).get("name")
|
||||||
country_value = "usa" if job.get("job_country") == "US" else "canada"
|
country_value = "usa" if job.get("job_country") == "US" else "canada"
|
||||||
country_enum = Country.from_string(country_value)
|
country_enum = Country.from_string(country_value)
|
||||||
|
@ -138,23 +144,22 @@ class ZipRecruiterScraper(Scraper):
|
||||||
job_type = self._get_job_type_enum(
|
job_type = self._get_job_type_enum(
|
||||||
job.get("employment_type", "").replace("_", "").lower()
|
job.get("employment_type", "").replace("_", "").lower()
|
||||||
)
|
)
|
||||||
date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date()
|
date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
|
||||||
|
comp_interval = job.get("compensation_interval")
|
||||||
|
comp_interval = "yearly" if comp_interval == "annual" else comp_interval
|
||||||
|
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
|
||||||
|
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
|
||||||
|
comp_currency = job.get("compensation_currency")
|
||||||
return JobPost(
|
return JobPost(
|
||||||
title=title,
|
title=title,
|
||||||
company_name=company,
|
company_name=company,
|
||||||
location=location,
|
location=location,
|
||||||
job_type=job_type,
|
job_type=job_type,
|
||||||
compensation=Compensation(
|
compensation=Compensation(
|
||||||
interval="yearly"
|
interval=comp_interval,
|
||||||
if job.get("compensation_interval") == "annual"
|
min_amount=comp_min,
|
||||||
else job.get("compensation_interval"),
|
max_amount=comp_max,
|
||||||
min_amount=int(job["compensation_min"])
|
currency=comp_currency,
|
||||||
if "compensation_min" in job
|
|
||||||
else None,
|
|
||||||
max_amount=int(job["compensation_max"])
|
|
||||||
if "compensation_max" in job
|
|
||||||
else None,
|
|
||||||
currency=job.get("compensation_currency"),
|
|
||||||
),
|
),
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
|
@ -163,8 +168,9 @@ class ZipRecruiterScraper(Scraper):
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_cookies(self):
|
def _get_cookies(self):
|
||||||
data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
||||||
self.session.post(f"{self.api_url}/jobs-app/event", data=data, headers=self.headers)
|
url = f"{self.api_url}/jobs-app/event"
|
||||||
|
self.session.post(url, data=data, headers=self.headers)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
|
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
|
||||||
|
@ -180,16 +186,13 @@ class ZipRecruiterScraper(Scraper):
|
||||||
"location": scraper_input.location,
|
"location": scraper_input.location,
|
||||||
}
|
}
|
||||||
if scraper_input.hours_old:
|
if scraper_input.hours_old:
|
||||||
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
|
params["days"] = max(scraper_input.hours_old // 24, 1)
|
||||||
params['days'] = fromage
|
job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
|
||||||
job_type_map = {
|
|
||||||
JobType.FULL_TIME: 'full_time',
|
|
||||||
JobType.PART_TIME: 'part_time'
|
|
||||||
}
|
|
||||||
if scraper_input.job_type:
|
if scraper_input.job_type:
|
||||||
params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0]
|
job_type = scraper_input.job_type
|
||||||
|
params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
|
||||||
if scraper_input.easy_apply:
|
if scraper_input.easy_apply:
|
||||||
params['zipapply'] = 1
|
params["zipapply"] = 1
|
||||||
if scraper_input.is_remote:
|
if scraper_input.is_remote:
|
||||||
params["remote"] = 1
|
params["remote"] = 1
|
||||||
if scraper_input.distance:
|
if scraper_input.distance:
|
||||||
|
|
Loading…
Reference in New Issue