mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 12:04:33 -08:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
02caf1b38d | ||
|
|
8e2ab277da | ||
|
|
ce3bd84ee5 | ||
|
|
1ccf2290fe | ||
|
|
ec2eefc58a | ||
|
|
13c7694474 | ||
|
|
bbe46fe3f4 |
10
README.md
10
README.md
@@ -69,7 +69,7 @@ Optional
|
|||||||
├── is_remote (bool)
|
├── is_remote (bool)
|
||||||
├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower)
|
├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower)
|
||||||
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
|
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
|
||||||
├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
|
├── easy_apply (bool): filters for jobs that are hosted on the job board site
|
||||||
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
|
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
|
||||||
├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
|
├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
|
||||||
```
|
```
|
||||||
@@ -80,6 +80,7 @@ Optional
|
|||||||
JobPost
|
JobPost
|
||||||
├── title (str)
|
├── title (str)
|
||||||
├── company (str)
|
├── company (str)
|
||||||
|
├── company_url (str)
|
||||||
├── job_url (str)
|
├── job_url (str)
|
||||||
├── location (object)
|
├── location (object)
|
||||||
│ ├── country (str)
|
│ ├── country (str)
|
||||||
@@ -158,16 +159,11 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
|
|||||||
**Q: Received a response code 429?**
|
**Q: Received a response code 429?**
|
||||||
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:
|
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:
|
||||||
|
|
||||||
- Waiting a few seconds between requests.
|
- Waiting some time between scrapes (site-dependent).
|
||||||
- Trying a VPN or proxy to change your IP address.
|
- Trying a VPN or proxy to change your IP address.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
**Q: Experiencing a "Segmentation fault: 11" on macOS Catalina?**
|
|
||||||
**A:** This is due to the `tls_client` dependency not supporting your architecture. Solutions and workarounds include:
|
|
||||||
|
|
||||||
- Upgrade to a newer version of macOS
|
|
||||||
- Reach out to the maintainers of [tls_client](https://github.com/bogdanfinn/tls-client) for fixes
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
18
poetry.lock
generated
18
poetry.lock
generated
@@ -1053,16 +1053,6 @@ files = [
|
|||||||
{file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
|
{file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
|
||||||
{file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
|
{file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
|
||||||
{file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
|
{file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"},
|
|
||||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
|
{file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
|
||||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
|
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
|
||||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
|
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
|
||||||
@@ -2270,13 +2260,13 @@ test = ["flake8", "isort", "pytest"]
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tls-client"
|
name = "tls-client"
|
||||||
version = "0.2.1"
|
version = "1.0"
|
||||||
description = "Advanced Python HTTP Client."
|
description = "Advanced Python HTTP Client."
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
{file = "tls_client-0.2.1-py3-none-any.whl", hash = "sha256:124a710952b979d5e20b4e2b7879b7958d6e48a259d0f5b83101055eb173f0bd"},
|
{file = "tls_client-1.0-py3-none-any.whl", hash = "sha256:f1183f5e18cb31914bd62d11b350a33ea0293ea80fb91d69a3072821dece3e66"},
|
||||||
{file = "tls_client-0.2.1.tar.gz", hash = "sha256:473fb4c671d9d4ca6b818548ab6e955640dd589767bfce520830c5618c2f2e2b"},
|
{file = "tls_client-1.0.tar.gz", hash = "sha256:7f6de48ad4a0ef69b72682c76ce604155971e07b4bfb2148a36276194ae3e7a0"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2445,4 +2435,4 @@ files = [
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "f966f3979873eec2c3b13460067f5aa414c69aa8ab5cd3239c1cfa564fcb5deb"
|
content-hash = "404a77d78066cbb2ef71015562baf44aa11d12aac29a191c1ccc7758bfda598a"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.36"
|
version = "1.1.41"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
@@ -13,7 +13,7 @@ packages = [
|
|||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
requests = "^2.31.0"
|
requests = "^2.31.0"
|
||||||
tls-client = "^0.2.1"
|
tls-client = "*"
|
||||||
beautifulsoup4 = "^4.12.2"
|
beautifulsoup4 = "^4.12.2"
|
||||||
pandas = "^2.1.0"
|
pandas = "^2.1.0"
|
||||||
NUMPY = "1.24.2"
|
NUMPY = "1.24.2"
|
||||||
|
|||||||
@@ -88,13 +88,14 @@ class GlassdoorScraper(Scraper):
|
|||||||
def process_job(self, job_data):
|
def process_job(self, job_data):
|
||||||
"""Processes a single job and fetches its description."""
|
"""Processes a single job and fetches its description."""
|
||||||
job_id = job_data["jobview"]["job"]["listingId"]
|
job_id = job_data["jobview"]["job"]["listingId"]
|
||||||
job_url = f'{self.url}/job-listing/?jl={job_id}'
|
job_url = f'{self.url}job-listing/j?jl={job_id}'
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
return None
|
return None
|
||||||
self.seen_urls.add(job_url)
|
self.seen_urls.add(job_url)
|
||||||
job = job_data["jobview"]
|
job = job_data["jobview"]
|
||||||
title = job["job"]["jobTitleText"]
|
title = job["job"]["jobTitleText"]
|
||||||
company_name = job["header"]["employerNameFromSearch"]
|
company_name = job["header"]["employerNameFromSearch"]
|
||||||
|
company_id = job_data['jobview']['header']['employer']['id']
|
||||||
location_name = job["header"].get("locationName", "")
|
location_name = job["header"].get("locationName", "")
|
||||||
location_type = job["header"].get("locationType", "")
|
location_type = job["header"].get("locationType", "")
|
||||||
age_in_days = job["header"].get("ageInDays")
|
age_in_days = job["header"].get("ageInDays")
|
||||||
@@ -115,6 +116,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
|
|
||||||
job_post = JobPost(
|
job_post = JobPost(
|
||||||
title=title,
|
title=title,
|
||||||
|
company_url=f"{self.url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
|
||||||
company_name=company_name,
|
company_name=company_name,
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
@@ -244,6 +246,8 @@ class GlassdoorScraper(Scraper):
|
|||||||
location_type = "CITY"
|
location_type = "CITY"
|
||||||
elif location_type == "S":
|
elif location_type == "S":
|
||||||
location_type = "STATE"
|
location_type = "STATE"
|
||||||
|
elif location_type == 'N':
|
||||||
|
location_type = "COUNTRY"
|
||||||
return int(items[0]["locationId"]), location_type
|
return int(items[0]["locationId"]), location_type
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -258,7 +262,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
"operationName": "JobSearchResultsQuery",
|
"operationName": "JobSearchResultsQuery",
|
||||||
"variables": {
|
"variables": {
|
||||||
"excludeJobListingIds": [],
|
"excludeJobListingIds": [],
|
||||||
"filterParams": [],
|
"filterParams": [{"filterKey": "applicationType", "values": "1"}] if scraper_input.easy_apply else [],
|
||||||
"keyword": scraper_input.search_term,
|
"keyword": scraper_input.search_term,
|
||||||
"numJobsToShow": 30,
|
"numJobsToShow": 30,
|
||||||
"locationType": location_type,
|
"locationType": location_type,
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import re
|
|||||||
import math
|
import math
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
|
from typing import Any
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
@@ -44,7 +45,7 @@ class IndeedScraper(Scraper):
|
|||||||
site = Site(Site.INDEED)
|
site = Site(Site.INDEED)
|
||||||
super().__init__(site, proxy=proxy)
|
super().__init__(site, proxy=proxy)
|
||||||
|
|
||||||
self.jobs_per_page = 15
|
self.jobs_per_page = 25
|
||||||
self.seen_urls = set()
|
self.seen_urls = set()
|
||||||
|
|
||||||
def scrape_page(
|
def scrape_page(
|
||||||
@@ -60,30 +61,12 @@ class IndeedScraper(Scraper):
|
|||||||
domain = self.country.indeed_domain_value
|
domain = self.country.indeed_domain_value
|
||||||
self.url = f"https://{domain}.indeed.com"
|
self.url = f"https://{domain}.indeed.com"
|
||||||
|
|
||||||
params = {
|
|
||||||
"q": scraper_input.search_term,
|
|
||||||
"l": scraper_input.location,
|
|
||||||
"filter": 0,
|
|
||||||
"start": scraper_input.offset + page * 10,
|
|
||||||
"sort": "date"
|
|
||||||
}
|
|
||||||
if scraper_input.distance:
|
|
||||||
params["radius"] = scraper_input.distance
|
|
||||||
|
|
||||||
sc_values = []
|
|
||||||
if scraper_input.is_remote:
|
|
||||||
sc_values.append("attr(DSQF7)")
|
|
||||||
if scraper_input.job_type:
|
|
||||||
sc_values.append("jt({})".format(scraper_input.job_type.value))
|
|
||||||
|
|
||||||
if sc_values:
|
|
||||||
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
|
||||||
try:
|
try:
|
||||||
session = create_session(self.proxy)
|
session = create_session(self.proxy)
|
||||||
response = session.get(
|
response = session.get(
|
||||||
f"{self.url}/jobs",
|
f"{self.url}/m/jobs",
|
||||||
headers=self.get_headers(),
|
headers=self.get_headers(),
|
||||||
params=params,
|
params=self.add_params(scraper_input, page),
|
||||||
allow_redirects=True,
|
allow_redirects=True,
|
||||||
timeout_seconds=10,
|
timeout_seconds=10,
|
||||||
)
|
)
|
||||||
@@ -112,8 +95,8 @@ class IndeedScraper(Scraper):
|
|||||||
):
|
):
|
||||||
raise IndeedException("No jobs found.")
|
raise IndeedException("No jobs found.")
|
||||||
|
|
||||||
def process_job(job) -> JobPost | None:
|
def process_job(job: dict) -> JobPost | None:
|
||||||
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
|
job_url = f'{self.url}/m/jobs/viewjob?jk={job["jobkey"]}'
|
||||||
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
|
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
return None
|
return None
|
||||||
@@ -171,8 +154,9 @@ class IndeedScraper(Scraper):
|
|||||||
)
|
)
|
||||||
return job_post
|
return job_post
|
||||||
|
|
||||||
|
workers = 10 if scraper_input.full_description else 10 # possibly lessen 10 when fetching desc based on feedback
|
||||||
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
||||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
with ThreadPoolExecutor(max_workers=workers) as executor:
|
||||||
job_results: list[Future] = [
|
job_results: list[Future] = [
|
||||||
executor.submit(process_job, job) for job in jobs
|
executor.submit(process_job, job) for job in jobs
|
||||||
]
|
]
|
||||||
@@ -194,7 +178,7 @@ class IndeedScraper(Scraper):
|
|||||||
#: get first page to initialize session
|
#: get first page to initialize session
|
||||||
job_list, total_results = self.scrape_page(scraper_input, 0)
|
job_list, total_results = self.scrape_page(scraper_input, 0)
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||||
futures: list[Future] = [
|
futures: list[Future] = [
|
||||||
executor.submit(self.scrape_page, scraper_input, page)
|
executor.submit(self.scrape_page, scraper_input, page)
|
||||||
for page in range(1, pages_to_process + 1)
|
for page in range(1, pages_to_process + 1)
|
||||||
@@ -223,7 +207,7 @@ class IndeedScraper(Scraper):
|
|||||||
parsed_url = urllib.parse.urlparse(job_page_url)
|
parsed_url = urllib.parse.urlparse(job_page_url)
|
||||||
params = urllib.parse.parse_qs(parsed_url.query)
|
params = urllib.parse.parse_qs(parsed_url.query)
|
||||||
jk_value = params.get("jk", [None])[0]
|
jk_value = params.get("jk", [None])[0]
|
||||||
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
|
formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
|
||||||
session = create_session(self.proxy)
|
session = create_session(self.proxy)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -240,10 +224,18 @@ class IndeedScraper(Scraper):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(response.text)
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
job_description = data["body"]["jobInfoWrapperModel"]["jobInfoModel"][
|
script_tags = soup.find_all('script')
|
||||||
"sanitizedJobDescription"
|
|
||||||
]
|
job_description = ''
|
||||||
|
for tag in script_tags:
|
||||||
|
if 'window._initialData' in tag.text:
|
||||||
|
json_str = tag.text
|
||||||
|
json_str = json_str.split('window._initialData=')[1]
|
||||||
|
json_str = json_str.rsplit(';', 1)[0]
|
||||||
|
data = json.loads(json_str)
|
||||||
|
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
|
||||||
|
break
|
||||||
except (KeyError, TypeError, IndexError):
|
except (KeyError, TypeError, IndexError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -331,17 +323,14 @@ class IndeedScraper(Scraper):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def get_headers():
|
def get_headers():
|
||||||
return {
|
return {
|
||||||
"authority": "www.indeed.com",
|
'Host': 'www.indeed.com',
|
||||||
"accept": "*/*",
|
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||||
"accept-language": "en-US,en;q=0.9",
|
'sec-fetch-site': 'same-origin',
|
||||||
"referer": "https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw",
|
'sec-fetch-dest': 'document',
|
||||||
"sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
|
'accept-language': 'en-US,en;q=0.9',
|
||||||
"sec-ch-ua-mobile": "?0",
|
'sec-fetch-mode': 'navigate',
|
||||||
"sec-ch-ua-platform": '"Windows"',
|
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0',
|
||||||
"sec-fetch-dest": "empty",
|
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
|
||||||
"sec-fetch-mode": "cors",
|
|
||||||
"sec-fetch-site": "same-origin",
|
|
||||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -354,3 +343,29 @@ class IndeedScraper(Scraper):
|
|||||||
if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0:
|
if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
|
||||||
|
params = {
|
||||||
|
"q": scraper_input.search_term,
|
||||||
|
"l": scraper_input.location,
|
||||||
|
"filter": 0,
|
||||||
|
"start": scraper_input.offset + page * 10,
|
||||||
|
"sort": "date"
|
||||||
|
}
|
||||||
|
if scraper_input.distance:
|
||||||
|
params["radius"] = scraper_input.distance
|
||||||
|
|
||||||
|
sc_values = []
|
||||||
|
if scraper_input.is_remote:
|
||||||
|
sc_values.append("attr(DSQF7)")
|
||||||
|
if scraper_input.job_type:
|
||||||
|
sc_values.append("jt({})".format(scraper_input.job_type.value))
|
||||||
|
|
||||||
|
if sc_values:
|
||||||
|
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
||||||
|
|
||||||
|
if scraper_input.easy_apply:
|
||||||
|
params['iafilter'] = 1
|
||||||
|
|
||||||
|
return params
|
||||||
|
|||||||
@@ -6,8 +6,7 @@ This module contains routines to scrape ZipRecruiter.
|
|||||||
"""
|
"""
|
||||||
import math
|
import math
|
||||||
import time
|
import time
|
||||||
import re
|
from datetime import datetime, timezone
|
||||||
from datetime import datetime, date
|
|
||||||
from typing import Optional, Tuple, Any
|
from typing import Optional, Tuple, Any
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@@ -44,12 +43,12 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
"""
|
"""
|
||||||
params = self.add_params(scraper_input)
|
params = self.add_params(scraper_input)
|
||||||
if continue_token:
|
if continue_token:
|
||||||
params["continue"] = continue_token
|
params["continue_from"] = continue_token
|
||||||
try:
|
try:
|
||||||
response = self.session.get(
|
response = self.session.get(
|
||||||
f"https://api.ziprecruiter.com/jobs-app/jobs",
|
f"https://api.ziprecruiter.com/jobs-app/jobs",
|
||||||
headers=self.headers(),
|
headers=self.headers(),
|
||||||
params=self.add_params(scraper_input),
|
params=params
|
||||||
)
|
)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
raise ZipRecruiterException(
|
raise ZipRecruiterException(
|
||||||
@@ -68,7 +67,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
||||||
job_results = [executor.submit(self.process_job, job) for job in jobs_list]
|
job_results = [executor.submit(self.process_job, job) for job in jobs_list]
|
||||||
|
|
||||||
job_list = [result.result() for result in job_results if result.result()]
|
job_list = list(filter(None, (result.result() for result in job_results)))
|
||||||
return job_list, next_continue_token
|
return job_list, next_continue_token
|
||||||
|
|
||||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||||
@@ -95,16 +94,15 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
if not continue_token:
|
if not continue_token:
|
||||||
break
|
break
|
||||||
|
|
||||||
if len(job_list) > scraper_input.results_wanted:
|
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
|
||||||
job_list = job_list[: scraper_input.results_wanted]
|
|
||||||
|
|
||||||
return JobResponse(jobs=job_list)
|
def process_job(self, job: dict) -> JobPost | None:
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def process_job(job: dict) -> JobPost:
|
|
||||||
"""Processes an individual job dict from the response"""
|
"""Processes an individual job dict from the response"""
|
||||||
title = job.get("name")
|
title = job.get("name")
|
||||||
job_url = job.get("job_url")
|
job_url = f"https://www.ziprecruiter.com/jobs//j?lvk={job['listing_key']}"
|
||||||
|
if job_url in self.seen_urls:
|
||||||
|
return
|
||||||
|
self.seen_urls.add(job_url)
|
||||||
|
|
||||||
job_description_html = job.get("job_description", "").strip()
|
job_description_html = job.get("job_description", "").strip()
|
||||||
description_soup = BeautifulSoup(job_description_html, "html.parser")
|
description_soup = BeautifulSoup(job_description_html, "html.parser")
|
||||||
@@ -120,17 +118,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
job_type = ZipRecruiterScraper.get_job_type_enum(
|
job_type = ZipRecruiterScraper.get_job_type_enum(
|
||||||
job.get("employment_type", "").replace("_", "").lower()
|
job.get("employment_type", "").replace("_", "").lower()
|
||||||
)
|
)
|
||||||
|
date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date()
|
||||||
save_job_url = job.get("SaveJobURL", "")
|
|
||||||
posted_time_match = re.search(
|
|
||||||
r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url
|
|
||||||
)
|
|
||||||
if posted_time_match:
|
|
||||||
date_time_str = posted_time_match.group(1)
|
|
||||||
date_posted_obj = datetime.strptime(date_time_str, "%Y-%m-%dT%H:%M:%SZ")
|
|
||||||
date_posted = date_posted_obj.date()
|
|
||||||
else:
|
|
||||||
date_posted = date.today()
|
|
||||||
|
|
||||||
return JobPost(
|
return JobPost(
|
||||||
title=title,
|
title=title,
|
||||||
@@ -173,7 +161,6 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
params = {
|
params = {
|
||||||
"search": scraper_input.search_term,
|
"search": scraper_input.search_term,
|
||||||
"location": scraper_input.location,
|
"location": scraper_input.location,
|
||||||
"form": "jobs-landing",
|
|
||||||
}
|
}
|
||||||
job_type_value = None
|
job_type_value = None
|
||||||
if scraper_input.job_type:
|
if scraper_input.job_type:
|
||||||
@@ -183,6 +170,8 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
job_type_value = "part_time"
|
job_type_value = "part_time"
|
||||||
else:
|
else:
|
||||||
job_type_value = scraper_input.job_type.value
|
job_type_value = scraper_input.job_type.value
|
||||||
|
if scraper_input.easy_apply:
|
||||||
|
params['zipapply'] = 1
|
||||||
|
|
||||||
if job_type_value:
|
if job_type_value:
|
||||||
params[
|
params[
|
||||||
|
|||||||
Reference in New Issue
Block a user