Compare commits

...

9 Commits

Author SHA1 Message Date
Cullen Watson
80213f28d2 chore: version 2024-03-11 09:43:12 -05:00
Cullen Watson
ada38532c3 fix: indeed empty location term 2024-03-11 09:42:43 -05:00
Cullen Watson
3b0017964c fix: indeed empty search term 2024-03-11 09:21:11 -05:00
VitaminB16
94d8f555fd format: Apply Black formatter to the codebase (#127) 2024-03-10 23:36:27 -05:00
Cullen Watson
e8b4b376b8 docs: readme 2024-03-09 13:40:34 -06:00
Cullen Watson
54ac1bad16 docs: readme 2024-03-09 01:49:05 -06:00
Cullen Watson
0a669e9ba8 enh: indeed more fields (#126) 2024-03-09 01:40:01 -06:00
gigaSec
a4f6851c32 Fix GlassDoor Country Vietnam(#122) 2024-03-04 17:35:57 -06:00
troy-conte
db01bc6bbb log search updates, fix glassdoor (#120) 2024-03-04 16:39:38 -06:00
12 changed files with 869 additions and 582 deletions

7
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,7 @@
repos:
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
- id: black
language_version: python
args: [--line-length=88, --quiet]

View File

@@ -21,7 +21,7 @@ Updated for release v1.1.3
### Installation
```
pip install python-jobspy
pip install -U python-jobspy
```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
@@ -37,7 +37,7 @@ jobs = scrape_jobs(
search_term="software engineer",
location="Dallas, TX",
results_wanted=20,
hours_old=72, # (only linkedin is hour specific, others round up to days old)
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA' # only needed for indeed / glassdoor
)
print(f"Found {len(jobs)} jobs")
@@ -64,19 +64,19 @@ Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
└── search_term (str)
Optional
├── location (int)
├── distance (int): in miles
├── location (str)
├── distance (int): in miles, default 50
├── job_type (enum): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port'
├── is_remote (bool)
├── linkedin_fetch_description (bool): fetches full description for LinkedIn (slower)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on the job board site
├── easy_apply (bool): filters for jobs that are hosted on the job board site (not supported on Indeed)
├── linkedin_company_ids (list[int): searches for linkedin jobs with specific company ids
├── description_format (enum): markdown, html (format type of the job descriptions)
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn rounds up to next day)
├── hours_old (int): filters jobs by the number of hours since the job was posted (ZipRecruiter and Glassdoor round up to next day. If you use this on Indeed, it will not filter by job_type or is_remote)
```
### JobPost Schema
@@ -100,24 +100,26 @@ JobPost
│ └── currency (enum)
└── date_posted (date)
└── emails (str)
└── num_urgent_words (int)
└── is_remote (bool)
Indeed specific
├── company_country (str)
└── company_addresses (str)
└── company_industry (str)
└── company_employees_label (str)
└── company_revenue_label (str)
└── company_description (str)
└── ceo_name (str)
└── ceo_photo_url (str)
└── logo_photo_url (str)
└── banner_photo_url (str)
```
### Exceptions
The following exceptions may be raised when using JobSpy:
* `LinkedInException`
* `IndeedException`
* `ZipRecruiterException`
* `GlassdoorException`
## Supported Countries for Job Searching
### **LinkedIn**
LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we're using
LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we are using
### **ZipRecruiter**
@@ -147,10 +149,14 @@ You can specify the following countries when searching on Indeed (use the exact
| South Korea | Spain* | Sweden | Switzerland* |
| Taiwan | Thailand | Turkey | Ukraine |
| United Arab Emirates | UK* | USA* | Uruguay |
| Venezuela | Vietnam | | |
| Venezuela | Vietnam* | | |
Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.
## Notes
* Indeed is the best scraper currently with no rate limiting.
* Glassdoor/Ziprecruiter can only fetch 900/1000 jobs from the endpoints we are using on a given search.
* LinkedIn is the most restrictive and usually rate limits around the 10th page.
## Frequently Asked Questions
---
@@ -168,7 +174,3 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
- Trying a VPN or proxy to change your IP address.
---

232
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
[[package]]
name = "annotated-types"
@@ -203,6 +203,52 @@ soupsieve = ">1.2"
html5lib = ["html5lib"]
lxml = ["lxml"]
[[package]]
name = "black"
version = "24.2.0"
description = "The uncompromising code formatter."
optional = false
python-versions = ">=3.8"
files = [
{file = "black-24.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6981eae48b3b33399c8757036c7f5d48a535b962a7c2310d19361edeef64ce29"},
{file = "black-24.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d533d5e3259720fdbc1b37444491b024003e012c5173f7d06825a77508085430"},
{file = "black-24.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61a0391772490ddfb8a693c067df1ef5227257e72b0e4108482b8d41b5aee13f"},
{file = "black-24.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:992e451b04667116680cb88f63449267c13e1ad134f30087dec8527242e9862a"},
{file = "black-24.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:163baf4ef40e6897a2a9b83890e59141cc8c2a98f2dda5080dc15c00ee1e62cd"},
{file = "black-24.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e37c99f89929af50ffaf912454b3e3b47fd64109659026b678c091a4cd450fb2"},
{file = "black-24.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9de21bafcba9683853f6c96c2d515e364aee631b178eaa5145fc1c61a3cc92"},
{file = "black-24.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:9db528bccb9e8e20c08e716b3b09c6bdd64da0dd129b11e160bf082d4642ac23"},
{file = "black-24.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d84f29eb3ee44859052073b7636533ec995bd0f64e2fb43aeceefc70090e752b"},
{file = "black-24.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e08fb9a15c914b81dd734ddd7fb10513016e5ce7e6704bdd5e1251ceee51ac9"},
{file = "black-24.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:810d445ae6069ce64030c78ff6127cd9cd178a9ac3361435708b907d8a04c693"},
{file = "black-24.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ba15742a13de85e9b8f3239c8f807723991fbfae24bad92d34a2b12e81904982"},
{file = "black-24.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7e53a8c630f71db01b28cd9602a1ada68c937cbf2c333e6ed041390d6968faf4"},
{file = "black-24.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:93601c2deb321b4bad8f95df408e3fb3943d85012dddb6121336b8e24a0d1218"},
{file = "black-24.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0057f800de6acc4407fe75bb147b0c2b5cbb7c3ed110d3e5999cd01184d53b0"},
{file = "black-24.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:faf2ee02e6612577ba0181f4347bcbcf591eb122f7841ae5ba233d12c39dcb4d"},
{file = "black-24.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:057c3dc602eaa6fdc451069bd027a1b2635028b575a6c3acfd63193ced20d9c8"},
{file = "black-24.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:08654d0797e65f2423f850fc8e16a0ce50925f9337fb4a4a176a7aa4026e63f8"},
{file = "black-24.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca610d29415ee1a30a3f30fab7a8f4144e9d34c89a235d81292a1edb2b55f540"},
{file = "black-24.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:4dd76e9468d5536abd40ffbc7a247f83b2324f0c050556d9c371c2b9a9a95e31"},
{file = "black-24.2.0-py3-none-any.whl", hash = "sha256:e8a6ae970537e67830776488bca52000eaa37fa63b9988e8c487458d9cd5ace6"},
{file = "black-24.2.0.tar.gz", hash = "sha256:bce4f25c27c3435e4dace4815bcb2008b87e167e3bf4ee47ccdc5ce906eb4894"},
]
[package.dependencies]
click = ">=8.0.0"
mypy-extensions = ">=0.4.3"
packaging = ">=22.0"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "bleach"
version = "6.0.0"
@@ -308,6 +354,17 @@ files = [
[package.dependencies]
pycparser = "*"
[[package]]
name = "cfgv"
version = "3.4.0"
description = "Validate configuration and produce human readable error messages."
optional = false
python-versions = ">=3.8"
files = [
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
]
[[package]]
name = "charset-normalizer"
version = "3.2.0"
@@ -392,6 +449,20 @@ files = [
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
]
[[package]]
name = "click"
version = "8.1.7"
description = "Composable command line interface toolkit"
optional = false
python-versions = ">=3.7"
files = [
{file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
{file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.6"
@@ -471,6 +542,17 @@ files = [
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
]
[[package]]
name = "distlib"
version = "0.3.8"
description = "Distribution utilities"
optional = false
python-versions = "*"
files = [
{file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"},
{file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
]
[[package]]
name = "exceptiongroup"
version = "1.1.3"
@@ -513,6 +595,22 @@ files = [
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
[[package]]
name = "filelock"
version = "3.13.1"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.8"
files = [
{file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"},
{file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"},
]
[package.extras]
docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"]
testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
typing = ["typing-extensions (>=4.8)"]
[[package]]
name = "fqdn"
version = "1.5.1"
@@ -525,16 +623,19 @@ files = [
]
[[package]]
name = "html2text"
version = "2020.1.16"
description = "Turn HTML into equivalent Markdown-structured text."
name = "identify"
version = "2.5.35"
description = "File identification library for Python"
optional = false
python-versions = ">=3.5"
python-versions = ">=3.8"
files = [
{file = "html2text-2020.1.16-py3-none-any.whl", hash = "sha256:c7c629882da0cf377d66f073329ccf34a12ed2adf0169b9285ae4e63ef54c82b"},
{file = "html2text-2020.1.16.tar.gz", hash = "sha256:e296318e16b059ddb97f7a8a1d6a5c1d7af4544049a01e261731d2d5cc277bbb"},
{file = "identify-2.5.35-py2.py3-none-any.whl", hash = "sha256:c4de0081837b211594f8e877a6b4fad7ca32bbfc1a9307fdd61c28bfe923f13e"},
{file = "identify-2.5.35.tar.gz", hash = "sha256:10a7ca245cfcd756a554a7288159f72ff105ad233c7c4b9c6f0f4d108f5f6791"},
]
[package.extras]
license = ["ukkonen"]
[[package]]
name = "idna"
version = "3.4"
@@ -1037,6 +1138,21 @@ files = [
{file = "jupyterlab_widgets-3.0.8.tar.gz", hash = "sha256:d428ab97b8d87cc7c54cbf37644d6e0f0e662f23876e05fa460a73ec3257252a"},
]
[[package]]
name = "markdownify"
version = "0.11.6"
description = "Convert HTML to markdown."
optional = false
python-versions = "*"
files = [
{file = "markdownify-0.11.6-py3-none-any.whl", hash = "sha256:ba35fe289d5e9073bcd7d2cad629278fe25f1a93741fcdc0bfb4f009076d8324"},
{file = "markdownify-0.11.6.tar.gz", hash = "sha256:009b240e0c9f4c8eaf1d085625dcd4011e12f0f8cec55dedf9ea6f7655e49bfe"},
]
[package.dependencies]
beautifulsoup4 = ">=4.9,<5"
six = ">=1.15,<2"
[[package]]
name = "markupsafe"
version = "2.1.3"
@@ -1064,16 +1180,6 @@ files = [
{file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
{file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
{file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
{file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"},
{file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"},
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"},
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"},
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"},
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"},
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"},
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"},
{file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"},
{file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"},
{file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
@@ -1131,6 +1237,17 @@ files = [
{file = "mistune-3.0.1.tar.gz", hash = "sha256:e912116c13aa0944f9dc530db38eb88f6a77087ab128f49f84a48f4c05ea163c"},
]
[[package]]
name = "mypy-extensions"
version = "1.0.0"
description = "Type system extensions for programs checked with the mypy type checker."
optional = false
python-versions = ">=3.5"
files = [
{file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
]
[[package]]
name = "nbclient"
version = "0.8.0"
@@ -1222,6 +1339,20 @@ files = [
{file = "nest_asyncio-1.5.7.tar.gz", hash = "sha256:6a80f7b98f24d9083ed24608977c09dd608d83f91cccc24c9d2cba6d10e01c10"},
]
[[package]]
name = "nodeenv"
version = "1.8.0"
description = "Node.js virtual environment builder"
optional = false
python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*"
files = [
{file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"},
{file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"},
]
[package.dependencies]
setuptools = "*"
[[package]]
name = "notebook"
version = "7.0.3"
@@ -1408,6 +1539,17 @@ files = [
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
testing = ["docopt", "pytest (<6.0.0)"]
[[package]]
name = "pathspec"
version = "0.12.1"
description = "Utility library for gitignore style pattern matching of file paths."
optional = false
python-versions = ">=3.8"
files = [
{file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"},
{file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
]
[[package]]
name = "pexpect"
version = "4.8.0"
@@ -1463,6 +1605,24 @@ files = [
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "pre-commit"
version = "3.6.2"
description = "A framework for managing and maintaining multi-language pre-commit hooks."
optional = false
python-versions = ">=3.9"
files = [
{file = "pre_commit-3.6.2-py2.py3-none-any.whl", hash = "sha256:ba637c2d7a670c10daedc059f5c49b5bd0aadbccfcd7ec15592cf9665117532c"},
{file = "pre_commit-3.6.2.tar.gz", hash = "sha256:c3ef34f463045c88658c5b99f38c1e297abdcc0ff13f98d3370055fbbfabc67e"},
]
[package.dependencies]
cfgv = ">=2.0.0"
identify = ">=1.0.0"
nodeenv = ">=0.11.1"
pyyaml = ">=5.1"
virtualenv = ">=20.10.0"
[[package]]
name = "prometheus-client"
version = "0.17.1"
@@ -2189,6 +2349,22 @@ nativelib = ["pyobjc-framework-Cocoa", "pywin32"]
objc = ["pyobjc-framework-Cocoa"]
win32 = ["pywin32"]
[[package]]
name = "setuptools"
version = "69.1.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.8"
files = [
{file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"},
{file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"
version = "1.16.0"
@@ -2389,6 +2565,26 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
zstd = ["zstandard (>=0.18.0)"]
[[package]]
name = "virtualenv"
version = "20.25.1"
description = "Virtual Python Environment builder"
optional = false
python-versions = ">=3.7"
files = [
{file = "virtualenv-20.25.1-py3-none-any.whl", hash = "sha256:961c026ac520bac5f69acb8ea063e8a4f071bcc9457b9c1f28f6b085c511583a"},
{file = "virtualenv-20.25.1.tar.gz", hash = "sha256:e08e13ecdca7a0bd53798f356d5831434afa5b07b93f0abdf0797b7a06ffe197"},
]
[package.dependencies]
distlib = ">=0.3.7,<1"
filelock = ">=3.12.2,<4"
platformdirs = ">=3.9.1,<5"
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
[[package]]
name = "wcwidth"
version = "0.2.6"
@@ -2456,4 +2652,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "eea3694820df164179cdd8312d382eb5b29d6317c4d34c586e8866c69aaee9e9"
content-hash = "6ee18819a726314f61f20f0ed93b2db2a26c232269f045146d9a8f4e3f31eb01"

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.46"
version = "1.1.50"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
@@ -17,14 +17,19 @@ beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
NUMPY = "1.24.2"
pydantic = "^2.3.0"
html2text = "^2020.1.16"
tls-client = "^1.0.1"
markdownify = "^0.11.6"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "^24.2.0"
pre-commit = "^3.6.2"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 88

View File

@@ -1,8 +1,11 @@
from __future__ import annotations
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from .jobs import JobType, Location
from .scrapers.utils import logger
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
@@ -20,7 +23,7 @@ def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
location: str | None = None,
distance: int | None = None,
distance: int | None = 50,
is_remote: bool = False,
job_type: str | None = None,
easy_apply: bool | None = None,
@@ -69,6 +72,7 @@ def scrape_jobs(
for site in site_name
]
return site_types
country_enum = Country.from_string(country_indeed)
scraper_input = ScraperInput(
@@ -85,13 +89,16 @@ def scrape_jobs(
results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids,
offset=offset,
hours_old=hours_old
hours_old=hours_old,
)
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)
scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
logger.info(f"{site_name} finished scraping")
return site.value, scraped_data
site_to_jobs_dict = {}
@@ -114,9 +121,8 @@ def scrape_jobs(
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
job_data = job.dict()
job_data[
"job_url_hyper"
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_url = job_data["job_url"]
job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
job_data["job_type"] = (
@@ -153,18 +159,18 @@ def scrape_jobs(
if jobs_dfs:
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
# Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
# Desired column order
desired_order = [
"job_url_hyper" if hyperlinks else "job_url",
"site",
"job_url_hyper" if hyperlinks else "job_url",
"job_url_direct",
"title",
"company",
"company_url",
"location",
"job_type",
"date_posted",
@@ -173,10 +179,19 @@ def scrape_jobs(
"max_amount",
"currency",
"is_remote",
"num_urgent_words",
"benefits",
"emails",
"description",
"company_url",
"company_url_direct",
"company_addresses",
"company_industry",
"company_num_employees",
"company_revenue",
"company_description",
"logo_photo_url",
"banner_photo_url",
"ceo_name",
"ceo_photo_url",
]
# Step 3: Ensure all desired columns are present, adding missing ones as empty
@@ -188,6 +203,6 @@ def scrape_jobs(
jobs_df = jobs_df[desired_order]
# Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
else:
return pd.DataFrame()

View File

@@ -1,3 +1,5 @@
from __future__ import annotations
from typing import Optional
from datetime import date
from enum import Enum
@@ -57,7 +59,7 @@ class JobType(Enum):
class Country(Enum):
"""
Gets the subdomain for Indeed and Glassdoor.
The second item in the tuple is the subdomain for Indeed
The second item in the tuple is the subdomain (and API country code if there's a ':' separator) for Indeed
The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
"""
@@ -118,11 +120,11 @@ class Country(Enum):
TURKEY = ("turkey", "tr")
UKRAINE = ("ukraine", "ua")
UNITEDARABEMIRATES = ("united arab emirates", "ae")
UK = ("uk,united kingdom", "uk", "co.uk")
USA = ("usa,us,united states", "www", "com")
UK = ("uk,united kingdom", "uk:gb", "co.uk")
USA = ("usa,us,united states", "www:us", "com")
URUGUAY = ("uruguay", "uy")
VENEZUELA = ("venezuela", "ve")
VIETNAM = ("vietnam", "vn")
VIETNAM = ("vietnam", "vn", "com")
# internal for ziprecruiter
US_CANADA = ("usa/ca", "www")
@@ -132,7 +134,10 @@ class Country(Enum):
@property
def indeed_domain_value(self):
return self.value[1]
subdomain, _, api_country_code = self.value[1].partition(":")
if subdomain and api_country_code:
return subdomain, api_country_code.upper()
return self.value[1], self.value[1].upper()
@property
def glassdoor_domain_value(self):
@@ -145,7 +150,7 @@ class Country(Enum):
else:
raise Exception(f"Glassdoor is not available for {self.name}")
def get_url(self):
def get_glassdoor_url(self):
return f"https://{self.glassdoor_domain_value}/"
@classmethod
@@ -153,7 +158,7 @@ class Country(Enum):
"""Convert a string to the corresponding Country enum."""
country_str = country_str.strip().lower()
for country in cls:
country_names = country.value[0].split(',')
country_names = country.value[0].split(",")
if country_str in country_names:
return country
valid_countries = [country.value for country in cls]
@@ -163,7 +168,7 @@ class Country(Enum):
class Location(BaseModel):
country: Country | None = None
country: Country | str | None = None
city: Optional[str] = None
state: Optional[str] = None
@@ -173,7 +178,12 @@ class Location(BaseModel):
location_parts.append(self.city)
if self.state:
location_parts.append(self.state)
if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
if isinstance(self.country, str):
location_parts.append(self.country)
elif self.country and self.country not in (
Country.US_CANADA,
Country.WORLDWIDE,
):
country_name = self.country.value[0]
if "," in country_name:
country_name = country_name.split(",")[0]
@@ -217,21 +227,31 @@ class DescriptionFormat(Enum):
class JobPost(BaseModel):
title: str
company_name: str
company_name: str | None
job_url: str
job_url_direct: str | None = None
location: Optional[Location]
description: str | None = None
company_url: str | None = None
company_url_direct: str | None = None
job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None
benefits: str | None = None
emails: list[str] | None = None
num_urgent_words: int | None = None
is_remote: bool | None = None
# company_industry: str | None = None
# indeed specific
company_addresses: str | None = None
company_industry: str | None = None
company_num_employees: str | None = None
company_revenue: str | None = None
company_description: str | None = None
ceo_name: str | None = None
ceo_photo_url: str | None = None
logo_photo_url: str | None = None
banner_photo_url: str | None = None
class JobResponse(BaseModel):

View File

@@ -1,10 +1,12 @@
from __future__ import annotations
from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat
DescriptionFormat,
)

View File

@@ -4,19 +4,23 @@ jobspy.scrapers.glassdoor
This module contains routines to scrape Glassdoor.
"""
from __future__ import annotations
import re
import json
import requests
from typing import Optional
from typing import Optional, Tuple
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from ..utils import count_urgent_words, extract_emails_from_text
from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text
from ..exceptions import GlassdoorException
from ..utils import (
create_session,
markdown_converter,
logger
logger,
)
from ...jobs import (
JobPost,
@@ -25,7 +29,7 @@ from ...jobs import (
Location,
JobResponse,
JobType,
DescriptionFormat
DescriptionFormat,
)
@@ -42,6 +46,7 @@ class GlassdoorScraper(Scraper):
self.session = None
self.scraper_input = None
self.jobs_per_page = 30
self.max_pages = 30
self.seen_urls = set()
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
@@ -52,39 +57,37 @@ class GlassdoorScraper(Scraper):
"""
self.scraper_input = scraper_input
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_url()
self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(self.proxy, is_tls=True, has_retry=True)
token = self._get_csrf_token()
self.headers["gd-csrf-token"] = token if token else self.fallback_token
location_id, location_type = self._get_location(
scraper_input.location, scraper_input.is_remote
)
if location_type is None:
logger.error("Glassdoor: location not parsed")
return JobResponse(jobs=[])
all_jobs: list[JobPost] = []
cursor = None
max_pages = 30
self.session = create_session(self.proxy, is_tls=False, has_retry=True)
self.session.get(self.base_url)
try:
for page in range(
1 + (scraper_input.offset // self.jobs_per_page),
min(
(scraper_input.results_wanted // self.jobs_per_page) + 2,
max_pages + 1,
),
):
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
range_end = min(tot_pages, self.max_pages + 1)
for page in range(range_start, range_end):
logger.info(f"Glassdoor search page: {page}")
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
)
all_jobs.extend(jobs)
if len(all_jobs) >= scraper_input.results_wanted:
if not jobs or len(all_jobs) >= scraper_input.results_wanted:
all_jobs = all_jobs[: scraper_input.results_wanted]
break
except Exception as e:
raise GlassdoorException(str(e))
except Exception as e:
raise GlassdoorException(str(e))
logger.error(f"Glassdoor: {str(e)}")
break
return JobResponse(jobs=all_jobs)
def _fetch_jobs_page(
@@ -94,63 +97,86 @@ class GlassdoorScraper(Scraper):
location_type: str,
page_num: int,
cursor: str | None,
) -> (list[JobPost], str | None):
) -> Tuple[list[JobPost], str | None]:
"""
Scrapes a page of Glassdoor for jobs with scraper_input criteria
"""
jobs = []
self.scraper_input = scraper_input
try:
payload = self._add_payload(
location_id, location_type, page_num, cursor
)
payload = self._add_payload(location_id, location_type, page_num, cursor)
response = self.session.post(
f"{self.base_url}/graph", headers=self.headers, timeout=10, data=payload
f"{self.base_url}/graph",
headers=self.headers,
timeout_seconds=15,
data=payload,
)
if response.status_code != 200:
raise GlassdoorException(
f"bad response status code: {response.status_code}"
)
exc_msg = f"bad response status code: {response.status_code}"
raise GlassdoorException(exc_msg)
res_json = response.json()[0]
if "errors" in res_json:
raise ValueError("Error encountered in API response")
except Exception as e:
raise GlassdoorException(str(e))
except (
requests.exceptions.ReadTimeout,
GlassdoorException,
ValueError,
Exception,
) as e:
logger.error(f"Glassdoor: {str(e)}")
return jobs, None
jobs_data = res_json["data"]["jobListings"]["jobListings"]
jobs = []
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
future_to_job_data = {
executor.submit(self._process_job, job): job for job in jobs_data
}
for future in as_completed(future_to_job_data):
try:
job_post = future.result()
if job_post:
jobs.append(job_post)
except Exception as exc:
raise GlassdoorException(f'Glassdoor generated an exception: {exc}')
raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
return jobs, self.get_cursor_for_page(
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
)
def _get_csrf_token(self):
"""
Fetches csrf token needed for API by visiting a generic page
"""
res = self.session.get(
f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
)
pattern = r'"token":\s*"([^"]+)"'
matches = re.findall(pattern, res.text)
token = None
if matches:
token = matches[0]
return token
def _process_job(self, job_data):
"""
Processes a single job and fetches its description.
"""
job_id = job_data["jobview"]["job"]["listingId"]
job_url = f'{self.base_url}job-listing/j?jl={job_id}'
job_url = f"{self.base_url}job-listing/j?jl={job_id}"
if job_url in self.seen_urls:
return None
self.seen_urls.add(job_url)
job = job_data["jobview"]
title = job["job"]["jobTitleText"]
company_name = job["header"]["employerNameFromSearch"]
company_id = job_data['jobview']['header']['employer']['id']
company_id = job_data["jobview"]["header"]["employer"]["id"]
location_name = job["header"].get("locationName", "")
location_type = job["header"].get("locationType", "")
age_in_days = job["header"].get("ageInDays")
is_remote, location = False, None
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days is not None else None
date_diff = (datetime.now() - timedelta(days=age_in_days)).date()
date_posted = date_diff if age_in_days is not None else None
if location_type == "S":
is_remote = True
@@ -162,9 +188,10 @@ class GlassdoorScraper(Scraper):
description = self._fetch_job_description(job_id)
except:
description = None
company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
return JobPost(
title=title,
company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
company_url=company_url if company_id else None,
company_name=company_name,
date_posted=date_posted,
job_url=job_url,
@@ -173,7 +200,6 @@ class GlassdoorScraper(Scraper):
is_remote=is_remote,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)
def _fetch_job_description(self, job_id):
@@ -187,7 +213,7 @@ class GlassdoorScraper(Scraper):
"variables": {
"jl": job_id,
"queryString": "q",
"pageTypeEnum": "SERP"
"pageTypeEnum": "SERP",
},
"query": """
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
@@ -202,28 +228,33 @@ class GlassdoorScraper(Scraper):
__typename
}
}
"""
""",
}
]
res = requests.post(url, json=body, headers=self.headers)
if res.status_code != 200:
return None
data = res.json()[0]
desc = data['data']['jobview']['job']['description']
return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc
desc = data["data"]["jobview"]["job"]["description"]
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
desc = markdown_converter(desc)
return desc
def _get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote:
return "11047", "STATE" # remote options
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
session = create_session(self.proxy, has_retry=True)
res = session.get(url)
res = self.session.get(url, headers=self.headers)
if res.status_code != 200:
if res.status_code == 429:
logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
err = f"429 Response - Blocked by Glassdoor for too many requests"
logger.error(err)
return None, None
else:
logger.error(f'Glassdoor response status code {res.status_code}')
err = f"Glassdoor response status code {res.status_code}"
err += f" - {res.text}"
logger.error(f"Glassdoor response status code {res.status_code}")
return None, None
items = res.json()
@@ -234,7 +265,7 @@ class GlassdoorScraper(Scraper):
location_type = "CITY"
elif location_type == "S":
location_type = "STATE"
elif location_type == 'N':
elif location_type == "N":
location_type = "COUNTRY"
return int(items[0]["locationId"]), location_type
@@ -245,7 +276,9 @@ class GlassdoorScraper(Scraper):
page_num: int,
cursor: str | None = None,
) -> str:
fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
fromage = None
if self.scraper_input.hours_old:
fromage = max(self.scraper_input.hours_old // 24, 1)
filter_params = []
if self.scraper_input.easy_apply:
filter_params.append({"filterKey": "applicationType", "values": "1"})
@@ -264,9 +297,76 @@ class GlassdoorScraper(Scraper):
"pageNumber": page_num,
"pageCursor": cursor,
"fromage": fromage,
"sort": "date"
"sort": "date",
},
"query": """
"query": self.query_template,
}
if self.scraper_input.job_type:
payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
)
return json.dumps([payload])
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
@staticmethod
def parse_location(location_name: str) -> Location | None:
if not location_name or location_name == "Remote":
return
city, _, state = location_name.partition(", ")
return Location(city=city, state=state)
@staticmethod
def get_cursor_for_page(pagination_cursors, page_num):
for cursor_data in pagination_cursors:
if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"]
fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok"
headers = {
"authority": "www.glassdoor.com",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9",
"apollographql-client-name": "job-search-next",
"apollographql-client-version": "4.65.5",
"content-type": "application/json",
"origin": "https://www.glassdoor.com",
"referer": "https://www.glassdoor.com/",
"sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"macOS"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
}
query_template = """
query JobSearchResultsQuery(
$excludeJobListingIds: [Long!],
$keyword: String,
@@ -432,69 +532,3 @@ class GlassdoorScraper(Scraper):
__typename
}
"""
}
if self.scraper_input.job_type:
payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
)
return json.dumps([payload])
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
@staticmethod
def parse_location(location_name: str) -> Location | None:
if not location_name or location_name == "Remote":
return
city, _, state = location_name.partition(", ")
return Location(city=city, state=state)
@staticmethod
def get_cursor_for_page(pagination_cursors, page_num):
for cursor_data in pagination_cursors:
if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"]
headers = {
"authority": "www.glassdoor.com",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9",
"apollographql-client-name": "job-search-next",
"apollographql-client-version": "4.65.5",
"content-type": "application/json",
"gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok",
"origin": "https://www.glassdoor.com",
"referer": "https://www.glassdoor.com/",
"sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"macOS"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
}

View File

@@ -4,25 +4,22 @@ jobspy.scrapers.indeed
This module contains routines to scrape Indeed.
"""
import re
import math
import json
import requests
from typing import Any
from datetime import datetime
from bs4 import BeautifulSoup
from bs4.element import Tag
from __future__ import annotations
import math
from typing import Tuple
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException
import requests
from .. import Scraper, ScraperInput, Site
from ..utils import (
count_urgent_words,
extract_emails_from_text,
create_session,
get_enum_from_job_type,
markdown_converter,
logger
logger,
)
from ...jobs import (
JobPost,
@@ -31,20 +28,21 @@ from ...jobs import (
Location,
JobResponse,
JobType,
DescriptionFormat
DescriptionFormat,
)
from .. import Scraper, ScraperInput, Site
class IndeedScraper(Scraper):
def __init__(self, proxy: str | None = None):
"""
Initializes IndeedScraper with the Indeed job search url
Initializes IndeedScraper with the Indeed API url
"""
self.scraper_input = None
self.jobs_per_page = 25
self.jobs_per_page = 100
self.num_workers = 10
self.seen_urls = set()
self.headers = None
self.api_country_code = None
self.base_url = None
self.api_url = "https://apis.indeed.com/graphql"
site = Site(Site.INDEED)
@@ -57,285 +55,272 @@ class IndeedScraper(Scraper):
:return: job_response
"""
self.scraper_input = scraper_input
job_list = self._scrape_page()
pages_processed = 1
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com"
self.headers = self.api_headers.copy()
self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
job_list = []
page = 1
cursor = None
offset_pages = math.ceil(self.scraper_input.offset / 100)
for _ in range(offset_pages):
logger.info(f"Indeed skipping search page: {page}")
__, cursor = self._scrape_page(cursor)
if not __:
logger.info(f"Indeed found no jobs on page: {page}")
break
while len(self.seen_urls) < scraper_input.results_wanted:
pages_to_process = math.ceil((scraper_input.results_wanted - len(self.seen_urls)) / self.jobs_per_page)
new_jobs = False
with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [
executor.submit(self._scrape_page, page + pages_processed)
for page in range(pages_to_process)
]
for future in futures:
jobs = future.result()
if jobs:
logger.info(f"Indeed search page: {page}")
jobs, cursor = self._scrape_page(cursor)
if not jobs:
logger.info(f"Indeed found no jobs on page: {page}")
break
job_list += jobs
new_jobs = True
if len(self.seen_urls) >= scraper_input.results_wanted:
break
page += 1
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
pages_processed += pages_to_process
if not new_jobs:
break
if len(self.seen_urls) > scraper_input.results_wanted:
job_list = job_list[:scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def _scrape_page(self, page: int=0) -> list[JobPost]:
def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
:param page:
:return: jobs found on page, total number of jobs found for search
:param cursor:
:return: jobs found on page, next page cursor
"""
job_list = []
domain = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com"
try:
session = create_session(self.proxy)
response = session.get(
f"{self.base_url}/m/jobs",
headers=self.headers,
params=self._add_params(page),
jobs = []
new_cursor = None
filters = self._build_filters()
query = self.job_search_query.format(
what=(
f'what: "{self.scraper_input.search_term}"'
if self.scraper_input.search_term
else ""
),
location=(
f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
if self.scraper_input.location
else ""
),
dateOnIndeed=self.scraper_input.hours_old,
cursor=f'cursor: "{cursor}"' if cursor else "",
filters=filters,
)
if response.status_code not in range(200, 400):
if response.status_code == 429:
logger.error(f'429 Response - Blocked by Indeed for too many requests')
else:
logger.error(f'Indeed response status code {response.status_code}')
return job_list
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy')
else:
logger.error(f'Indeed: {str(e)}')
return job_list
soup = BeautifulSoup(response.content, "html.parser")
if "did not match any jobs" in response.text:
return job_list
jobs = IndeedScraper._parse_jobs(soup)
if not jobs:
return []
if (
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
logger.error("Indeed - No jobs found.")
return []
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
job_keys = [job['jobkey'] for job in jobs]
jobs_detailed = self._get_job_details(job_keys)
payload = {
"query": query,
}
api_headers = self.api_headers.copy()
api_headers["indeed-co"] = self.api_country_code
response = requests.post(
self.api_url,
headers=api_headers,
json=payload,
proxies=self.proxy,
timeout=10,
)
if response.status_code != 200:
logger.info(
f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)"
)
return jobs, new_cursor
data = response.json()
jobs = data["data"]["jobSearch"]["results"]
new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
job_results: list[Future] = [
executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
executor.submit(self._process_job, job["job"]) for job in jobs
]
job_list = [result.result() for result in job_results if result.result()]
return job_list, new_cursor
return job_list
def _build_filters(self):
"""
Builds the filters dict for job type/is_remote. If hours_old is provided, composite filter for job_type/is_remote is not possible.
IndeedApply: filters: { keyword: { field: "indeedApplyScope", keys: ["DESKTOP"] } }
"""
filters_str = ""
if self.scraper_input.hours_old:
filters_str = """
filters: {{
date: {{
field: "dateOnIndeed",
start: "{start}h"
}}
}}
""".format(
start=self.scraper_input.hours_old
)
elif self.scraper_input.job_type or self.scraper_input.is_remote:
job_type_key_mapping = {
JobType.FULL_TIME: "CF3CP",
JobType.PART_TIME: "75GKK",
JobType.CONTRACT: "NJXCK",
JobType.INTERNSHIP: "VDTG7",
}
def _process_job(self, job: dict, job_detailed: dict) -> JobPost | None:
job_url = f'{self.base_url}/m/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.base_url}/viewjob?jk={job["jobkey"]}'
keys = []
if self.scraper_input.job_type:
key = job_type_key_mapping[self.scraper_input.job_type]
keys.append(key)
if self.scraper_input.is_remote:
keys.append("DSQF7")
if keys:
keys_str = '", "'.join(keys) # Prepare your keys string
filters_str = f"""
filters: {{
composite: {{
filters: [{{
keyword: {{
field: "attributes",
keys: ["{keys_str}"]
}}
}}]
}}
}}
"""
return filters_str
def _process_job(self, job: dict) -> JobPost | None:
"""
Parses the job dict into JobPost model
:param job: dict to parse
:return: JobPost if it's a new job
"""
job_url = f'{self.base_url}/viewjob?jk={job["key"]}'
if job_url in self.seen_urls:
return None
return
self.seen_urls.add(job_url)
description = job_detailed['description']['html']
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
job_type = self._get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d")
description = job["description"]["html"]
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
job_type = self._get_job_type(job["attributes"])
timestamp_seconds = job["datePublished"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
employer = job["employer"].get("dossier") if job["employer"] else None
employer_details = employer.get("employerDetails", {}) if employer else {}
rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
return JobPost(
title=job["normTitle"],
title=job["title"],
description=description,
company_name=job["company"],
company_url=f"{self.base_url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed[
'employer'] else None,
company_name=job["employer"].get("name") if job.get("employer") else None,
company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
company_url_direct=(
employer["links"]["corporateWebsite"] if employer else None
),
location=Location(
city=job.get("jobLocationCity"),
state=job.get("jobLocationState"),
country=self.scraper_input.country,
city=job.get("location", {}).get("city"),
state=job.get("location", {}).get("admin1Code"),
country=job.get("location", {}).get("countryCode"),
),
job_type=job_type,
compensation=self._get_compensation(job, job_detailed),
compensation=self._get_compensation(job),
date_posted=date_posted,
job_url=job_url_client,
job_url=job_url,
job_url_direct=(
job["recruit"].get("viewJobUrl") if job.get("recruit") else None
),
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
is_remote=self._is_job_remote(job, job_detailed, description)
is_remote=self._is_job_remote(job, description),
company_addresses=(
employer_details["addresses"][0]
if employer_details.get("addresses")
else None
),
company_industry=(
employer_details["industry"]
.replace("Iv1", "")
.replace("_", " ")
.title()
if employer_details.get("industry")
else None
),
company_num_employees=employer_details.get("employeesLocalizedLabel"),
company_revenue=employer_details.get("revenueLocalizedLabel"),
company_description=employer_details.get("briefDescription"),
ceo_name=employer_details.get("ceoName"),
ceo_photo_url=employer_details.get("ceoPhotoUrl"),
logo_photo_url=(
employer["images"].get("squareLogoUrl")
if employer and employer.get("images")
else None
),
banner_photo_url=(
employer["images"].get("headerImageUrl")
if employer and employer.get("images")
else None
),
)
def _get_job_details(self, job_keys: list[str]) -> dict:
"""
Queries the GraphQL endpoint for detailed job information for the given job keys.
"""
job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']'
payload = dict(self.api_payload)
payload["query"] = self.api_payload["query"].format(job_keys_gql=job_keys_gql)
response = requests.post(self.api_url, headers=self.api_headers, json=payload, proxies=self.proxy)
if response.status_code == 200:
return response.json()['data']['jobData']['results']
else:
return {}
def _add_params(self, page: int) -> dict[str, str | Any]:
fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
params = {
"q": self.scraper_input.search_term,
"l": self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1],
"filter": 0,
"start": self.scraper_input.offset + page * 10,
"sort": "date",
"fromage": fromage,
}
if self.scraper_input.distance:
params["radius"] = self.scraper_input.distance
sc_values = []
if self.scraper_input.is_remote:
sc_values.append("attr(DSQF7)")
if self.scraper_input.job_type:
sc_values.append("jt({})".format(self.scraper_input.job_type.value[0]))
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
if self.scraper_input.easy_apply:
params['iafilter'] = 1
return params
@staticmethod
def _get_job_type(job: dict) -> list[JobType] | None:
def _get_job_type(attributes: list) -> list[JobType]:
"""
Parses the job to get list of job types
:param job:
:return:
Parses the attributes to get list of job types
:param attributes:
:return: list of JobType
"""
job_types: list[JobType] = []
for taxonomy in job["taxonomyAttributes"]:
if taxonomy["label"] == "job-types":
for i in range(len(taxonomy["attributes"])):
label = taxonomy["attributes"][i].get("label")
if label:
job_type_str = label.replace("-", "").replace(" ", "").lower()
for attribute in attributes:
job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
job_type = get_enum_from_job_type(job_type_str)
if job_type:
job_types.append(job_type)
return job_types
@staticmethod
def _get_compensation(job: dict, job_detailed: dict) -> Compensation:
def _get_compensation(job: dict) -> Compensation | None:
"""
Parses the job to get
Parses the job to get compensation
:param job:
:param job:
:param job_detailed:
:return: compensation object
"""
comp = job_detailed['compensation']['baseSalary']
if comp:
interval = IndeedScraper._get_correct_interval(comp['unitOfWork'])
if interval:
comp = job["compensation"]["baseSalary"]
if not comp:
return None
interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
if not interval:
return None
min_range = comp["range"].get("min")
max_range = comp["range"].get("max")
return Compensation(
interval=interval,
min_amount=round(comp['range'].get('min'), 2) if comp['range'].get('min') is not None else None,
max_amount=round(comp['range'].get('max'), 2) if comp['range'].get('max') is not None else None,
currency=job_detailed['compensation']['currencyCode']
min_amount=round(min_range, 2) if min_range is not None else None,
max_amount=round(max_range, 2) if max_range is not None else None,
currency=job["compensation"]["currencyCode"],
)
extracted_salary = job.get("extractedSalary")
compensation = None
if extracted_salary:
salary_snippet = job.get("salarySnippet")
currency = salary_snippet.get("currency") if salary_snippet else None
interval = (extracted_salary.get("type"),)
if isinstance(interval, tuple):
interval = interval[0]
interval = interval.upper()
if interval in CompensationInterval.__members__:
compensation = Compensation(
interval=CompensationInterval[interval],
min_amount=int(extracted_salary.get("min")),
max_amount=int(extracted_salary.get("max")),
currency=currency,
)
return compensation
@staticmethod
def _parse_jobs(soup: BeautifulSoup) -> dict:
def _is_job_remote(job: dict, description: str) -> bool:
"""
Parses the jobs from the soup object
:param soup:
:return: jobs
Searches the description, location, and attributes to check if job is remote
"""
def find_mosaic_script() -> Tag | None:
script_tags = soup.find_all("script")
for tag in script_tags:
if (
tag.string
and "mosaic.providerData" in tag.string
and "mosaic-provider-jobcards" in tag.string
):
return tag
return None
script_tag = find_mosaic_script()
if script_tag:
script_str = script_tag.string
pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});'
p = re.compile(pattern, re.DOTALL)
m = p.search(script_str)
if m:
jobs = json.loads(m.group(1).strip())
return jobs
else:
logger.warning(f'Indeed: Could not find mosaic provider job cards data')
return {}
else:
logger.warning(f"Indeed: Could not parse any jobs on the page")
return {}
@staticmethod
def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
remote_keywords = ['remote', 'work from home', 'wfh']
remote_keywords = ["remote", "work from home", "wfh"]
is_remote_in_attributes = any(
any(keyword in attr['label'].lower() for keyword in remote_keywords)
for attr in job_detailed['attributes']
any(keyword in attr["label"].lower() for keyword in remote_keywords)
for attr in job["attributes"]
)
is_remote_in_description = any(
keyword in description.lower() for keyword in remote_keywords
)
is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords)
is_remote_in_location = any(
keyword in job_detailed['location']['formatted']['long'].lower()
keyword in job["location"]["formatted"]["long"].lower()
for keyword in remote_keywords
)
is_remote_in_taxonomy = any(
taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
for taxonomy in job.get("taxonomyAttributes", [])
return (
is_remote_in_attributes or is_remote_in_description or is_remote_in_location
)
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy
@staticmethod
def _get_correct_interval(interval: str) -> CompensationInterval:
def _get_compensation_interval(interval: str) -> CompensationInterval:
interval_mapping = {
"DAY": "DAILY",
"YEAR": "YEARLY",
"HOUR": "HOURLY",
"WEEK": "WEEKLY",
"MONTH": "MONTHLY"
"MONTH": "MONTHLY",
}
mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__:
@@ -343,43 +328,44 @@ class IndeedScraper(Scraper):
else:
raise ValueError(f"Unsupported interval: {interval}")
headers = {
'Host': 'www.indeed.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'sec-fetch-site': 'same-origin',
'sec-fetch-dest': 'document',
'accept-language': 'en-US,en;q=0.9',
'sec-fetch-mode': 'navigate',
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0',
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
}
api_headers = {
'Host': 'apis.indeed.com',
'content-type': 'application/json',
'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8',
'accept': 'application/json',
'indeed-locale': 'en-US',
'accept-language': 'en-US,en;q=0.9',
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1',
'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone',
'indeed-co': 'US',
"Host": "apis.indeed.com",
"content-type": "application/json",
"indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
"accept": "application/json",
"indeed-locale": "en-US",
"accept-language": "en-US,en;q=0.9",
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
"indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
}
api_payload = {
"query": """
job_search_query = """
query GetJobData {{
jobData(input: {{
jobKeys: {job_keys_gql}
}}) {{
jobSearch(
{what}
{location}
includeSponsoredResults: NONE
limit: 100
sort: DATE
{cursor}
{filters}
) {{
pageInfo {{
nextCursor
}}
results {{
trackingKey
job {{
key
title
datePublished
dateOnIndeed
description {{
html
}}
location {{
countryName
countryCode
admin1Code
city
postalCode
streetAddress
@@ -401,10 +387,30 @@ class IndeedScraper(Scraper):
currencyCode
}}
attributes {{
key
label
}}
employer {{
relativeCompanyPageUrl
name
dossier {{
employerDetails {{
addresses
industry
employeesLocalizedLabel
revenueLocalizedLabel
briefDescription
ceoName
ceoPhotoUrl
}}
images {{
headerImageUrl
squareLogoUrl
}}
links {{
corporateWebsite
}}
}}
}}
recruit {{
viewJobUrl
@@ -416,4 +422,3 @@ class IndeedScraper(Scraper):
}}
}}
"""
}

View File

@@ -4,13 +4,14 @@ jobspy.scrapers.linkedin
This module contains routines to scrape LinkedIn.
"""
from __future__ import annotations
import time
import random
from typing import Optional
from datetime import datetime
import requests
from requests.exceptions import ProxyError
from threading import Lock
from bs4.element import Tag
from bs4 import BeautifulSoup
@@ -26,30 +27,30 @@ from ...jobs import (
JobType,
Country,
Compensation,
DescriptionFormat
DescriptionFormat,
)
from ..utils import (
logger,
count_urgent_words,
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
markdown_converter
markdown_converter,
)
class LinkedInScraper(Scraper):
base_url = "https://www.linkedin.com"
delay = 3
band_delay = 4
jobs_per_page = 25
def __init__(self, proxy: Optional[str] = None):
"""
Initializes LinkedInScraper with the LinkedIn job search url
"""
super().__init__(Site(Site.LINKEDIN), proxy=proxy)
self.scraper_input = None
site = Site(Site.LINKEDIN)
self.country = "worldwide"
super().__init__(site, proxy=proxy)
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@@ -63,28 +64,35 @@ class LinkedInScraper(Scraper):
url_lock = Lock()
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
seconds_old = (
scraper_input.hours_old * 3600
if scraper_input.hours_old
else None
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
)
continue_search = (
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
)
continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
while continue_search():
logger.info(f"LinkedIn search page: {page // 25 + 1}")
session = create_session(is_tls=False, has_retry=True, delay=5)
params = {
"keywords": scraper_input.search_term,
"location": scraper_input.location,
"distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None,
"f_JT": self.job_type_code(scraper_input.job_type)
"f_JT": (
self.job_type_code(scraper_input.job_type)
if scraper_input.job_type
else None,
else None
),
"pageNum": 0,
"start": page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None,
"f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None,
"f_TPR": f"r{seconds_old}",
"f_C": (
",".join(map(str, scraper_input.linkedin_company_ids))
if scraper_input.linkedin_company_ids
else None
),
}
if seconds_old is not None:
params["f_TPR"] = f"r{seconds_old}"
params = {k: v for k, v in params.items() if v is not None}
try:
@@ -98,16 +106,20 @@ class LinkedInScraper(Scraper):
)
if response.status_code not in range(200, 400):
if response.status_code == 429:
logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
err = (
f"429 Response - Blocked by LinkedIn for too many requests"
)
else:
logger.error(f'LinkedIn response status code {response.status_code}')
return JobResponse(job_list=job_list)
err = f"LinkedIn response status code {response.status_code}"
err += f" - {response.text}"
logger.error(err)
return JobResponse(jobs=job_list)
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f'LinkedIn: Bad proxy')
logger.error(f"LinkedIn: Bad proxy")
else:
logger.error(f'LinkedIn: {str(e)}')
return JobResponse(job_list=job_list)
logger.error(f"LinkedIn: {str(e)}")
return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser")
job_cards = soup.find_all("div", class_="base-search-card")
@@ -127,7 +139,8 @@ class LinkedInScraper(Scraper):
continue
seen_urls.add(job_url)
try:
job_post = self._process_job(job_card, job_url, scraper_input.linkedin_fetch_description)
fetch_desc = scraper_input.linkedin_fetch_description
job_post = self._process_job(job_card, job_url, fetch_desc)
if job_post:
job_list.append(job_post)
if not continue_search():
@@ -136,14 +149,16 @@ class LinkedInScraper(Scraper):
raise LinkedInException(str(e))
if continue_search():
time.sleep(random.uniform(self.delay, self.delay + 2))
page += 25
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
page += self.jobs_per_page
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def _process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
salary_tag = job_card.find('span', class_='job-search-card__salary-info')
def _process_job(
self, job_card: Tag, job_url: str, full_descr: bool
) -> Optional[JobPost]:
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
compensation = None
if salary_tag:
@@ -187,7 +202,6 @@ class LinkedInScraper(Scraper):
except:
date_posted = None
benefits_tag = job_card.find("span", class_="result-benefits__text")
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
if full_descr:
description, job_type = self._get_job_description(job_url)
@@ -199,11 +213,9 @@ class LinkedInScraper(Scraper):
date_posted=date_posted,
job_url=job_url,
compensation=compensation,
benefits=benefits,
job_type=job_type,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)
def _get_job_description(
@@ -216,7 +228,9 @@ class LinkedInScraper(Scraper):
"""
try:
session = create_session(is_tls=False, has_retry=True)
response = session.get(job_page_url, headers=self.headers, timeout=5, proxies=self.proxy)
response = session.get(
job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
)
response.raise_for_status()
except:
return None, None
@@ -229,10 +243,12 @@ class LinkedInScraper(Scraper):
)
description = None
if div_content is not None:
def remove_attributes(tag):
for attr in list(tag.attrs):
del tag[attr]
return tag
div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
@@ -261,11 +277,8 @@ class LinkedInScraper(Scraper):
)
elif len(parts) == 3:
city, state, country = parts
location = Location(
city=city,
state=state,
country = Country.from_string(country)
)
location = Location(city=city, state=state, country=country)
return location
@staticmethod

View File

@@ -1,49 +1,31 @@
from __future__ import annotations
import re
import logging
import numpy as np
import html2text
import tls_client
import requests
import tls_client
import numpy as np
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
from ..jobs import JobType
text_maker = html2text.HTML2Text()
logger = logging.getLogger("JobSpy")
logger.propagate = False
if not logger.handlers:
logger.setLevel(logging.ERROR)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
formatter = logging.Formatter(format)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
def count_urgent_words(description: str) -> int:
"""
Count the number of urgent words or phrases in a job description.
"""
urgent_patterns = re.compile(
r"\burgen(t|cy)|\bimmediate(ly)?\b|start asap|\bhiring (now|immediate(ly)?)\b",
re.IGNORECASE,
)
matches = re.findall(urgent_patterns, description)
count = len(matches)
return count
def markdown_converter(description_html: str):
if description_html is None:
return ""
text_maker.ignore_links = False
try:
markdown = text_maker.handle(description_html)
return None
markdown = md(description_html)
return markdown.strip()
except AssertionError as e:
return ""
def extract_emails_from_text(text: str) -> list[str] | None:
@@ -53,7 +35,12 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text)
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
def create_session(
proxy: dict | None = None,
is_tls: bool = True,
has_retry: bool = False,
delay: int = 1,
) -> requests.Session:
"""
Creates a requests session with optional tls, proxy, and retry settings.
:return: A session object
@@ -67,15 +54,17 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
if proxy:
session.proxies.update(proxy)
if has_retry:
retries = Retry(total=3,
retries = Retry(
total=3,
connect=3,
status=3,
status_forcelist=[500, 502, 503, 504, 429],
backoff_factor=delay)
backoff_factor=delay,
)
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)
session.mount("http://", adapter)
session.mount("https://", adapter)
return session
@@ -93,17 +82,15 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
def currency_parser(cur_str):
# Remove any non-numerical characters
# except for ',' '.' or '-' (e.g. EUR)
cur_str = re.sub("[^-0-9.,]", '', cur_str)
cur_str = re.sub("[^-0-9.,]", "", cur_str)
# Remove any 000s separators (either , or .)
cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
cur_str = re.sub("[.,]", "", cur_str[:-3]) + cur_str[-3:]
if '.' in list(cur_str[-3:]):
if "." in list(cur_str[-3:]):
num = float(cur_str)
elif ',' in list(cur_str[-3:]):
num = float(cur_str.replace(',', '.'))
elif "," in list(cur_str[-3:]):
num = float(cur_str.replace(",", "."))
else:
num = float(cur_str)
return np.round(num, 2)

View File

@@ -4,6 +4,9 @@ jobspy.scrapers.ziprecruiter
This module contains routines to scrape ZipRecruiter.
"""
from __future__ import annotations
import math
import time
from datetime import datetime
@@ -14,10 +17,9 @@ from concurrent.futures import ThreadPoolExecutor
from .. import Scraper, ScraperInput, Site
from ..utils import (
logger,
count_urgent_words,
extract_emails_from_text,
create_session,
markdown_converter
markdown_converter,
)
from ...jobs import (
JobPost,
@@ -26,7 +28,7 @@ from ...jobs import (
JobResponse,
JobType,
Country,
DescriptionFormat
DescriptionFormat,
)
@@ -63,7 +65,7 @@ class ZipRecruiterScraper(Scraper):
break
if page > 1:
time.sleep(self.delay)
logger.info(f"ZipRecruiter search page: {page}")
jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token
)
@@ -90,24 +92,23 @@ class ZipRecruiterScraper(Scraper):
params["continue_from"] = continue_token
try:
res = self.session.get(
f"{self.api_url}/jobs-app/jobs",
headers=self.headers,
params=params
f"{self.api_url}/jobs-app/jobs", headers=self.headers, params=params
)
if res.status_code not in range(200, 400):
if res.status_code == 429:
logger.error(f'429 Response - Blocked by ZipRecruiter for too many requests')
err = "429 Response - Blocked by ZipRecruiter for too many requests"
else:
logger.error(f'ZipRecruiter response status code {res.status_code}')
err = f"ZipRecruiter response status code {res.status_code}"
err += f" with response: {res.text}" # ZipRecruiter likely not available in EU
logger.error(err)
return jobs_list, ""
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy')
logger.error(f"Indeed: Bad proxy")
else:
logger.error(f'Indeed: {str(e)}')
logger.error(f"Indeed: {str(e)}")
return jobs_list, ""
res_data = res.json()
jobs_list = res_data.get("jobs", [])
next_continue_token = res_data.get("continue", None)
@@ -128,7 +129,11 @@ class ZipRecruiterScraper(Scraper):
self.seen_urls.add(job_url)
description = job.get("job_description", "").strip()
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
description = (
markdown_converter(description)
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
else description
)
company = job.get("hiring_company", {}).get("name")
country_value = "usa" if job.get("job_country") == "US" else "canada"
country_enum = Country.from_string(country_value)
@@ -139,34 +144,33 @@ class ZipRecruiterScraper(Scraper):
job_type = self._get_job_type_enum(
job.get("employment_type", "").replace("_", "").lower()
)
date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date()
date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
comp_interval = job.get("compensation_interval")
comp_interval = "yearly" if comp_interval == "annual" else comp_interval
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
comp_currency = job.get("compensation_currency")
return JobPost(
title=title,
company_name=company,
location=location,
job_type=job_type,
compensation=Compensation(
interval="yearly"
if job.get("compensation_interval") == "annual"
else job.get("compensation_interval"),
min_amount=int(job["compensation_min"])
if "compensation_min" in job
else None,
max_amount=int(job["compensation_max"])
if "compensation_max" in job
else None,
currency=job.get("compensation_currency"),
interval=comp_interval,
min_amount=comp_min,
max_amount=comp_max,
currency=comp_currency,
),
date_posted=date_posted,
job_url=job_url,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)
def _get_cookies(self):
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
self.session.post(f"{self.api_url}/jobs-app/event", data=data, headers=self.headers)
url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=data, headers=self.headers)
@staticmethod
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
@@ -182,16 +186,13 @@ class ZipRecruiterScraper(Scraper):
"location": scraper_input.location,
}
if scraper_input.hours_old:
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
params['days'] = fromage
job_type_map = {
JobType.FULL_TIME: 'full_time',
JobType.PART_TIME: 'part_time'
}
params["days"] = max(scraper_input.hours_old // 24, 1)
job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
if scraper_input.job_type:
params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0]
job_type = scraper_input.job_type
params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
if scraper_input.easy_apply:
params['zipapply'] = 1
params["zipapply"] = 1
if scraper_input.is_remote:
params["remote"] = 1
if scraper_input.distance: