mirror of https://github.com/Bunsly/JobSpy
commit df70d4bc2e ("minor")
@@ -1,22 +0,0 @@
-name: Python Tests
-
-on:
-  pull_request:
-    branches:
-      - main
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: '3.8'
-      - name: Install dependencies
-        run: |
-          pip install poetry
-          poetry install
-      - name: Run tests
-        run: poetry run pytest tests/test_all.py
@@ -42,7 +42,7 @@ def scrape_jobs(
     offset: int | None = 0,
     hours_old: int = None,
     enforce_annual_salary: bool = False,
-    verbose: int = 2,
+    verbose: int = 0,
     **kwargs,
 ) -> pd.DataFrame:
     """
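Note (not part of the commit): the default for verbose drops from 2 to 0, so scrape_jobs is now quiet unless the caller opts back in. A minimal sketch of a call that keeps the old, chatty behaviour; the search arguments are illustrative, and the 0/1/2 meaning (errors only / plus warnings / all logs) follows the library's usual convention:

from jobspy import scrape_jobs

# verbose=2 restores the previous default of full progress logging;
# with the new default of 0, only errors are reported.
jobs = scrape_jobs(
    site_name=["indeed", "glassdoor"],
    search_term="software engineer",
    results_wanted=10,
    verbose=2,
)
print(jobs.head())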
@@ -1,19 +1,22 @@
+"""
+jobspy.scrapers.bayt
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape Bayt.
+"""
+
 from __future__ import annotations
 
-import time
 import random
-from typing import Optional
+import time
 
-import requests
 from bs4 import BeautifulSoup
 
 from .. import Scraper, ScraperInput, Site
-from ..exceptions import BaytException
+from ..utils import create_logger, create_session
 from ...jobs import JobPost, JobResponse, Location, Country
-from ..utils import create_logger
 
-logger = create_logger("Bayt")
-logger.setLevel("DEBUG")  # Ensure DEBUG messages are output
+log = create_logger("Bayt")
 
 
 class BaytScraper(Scraper):
@@ -26,10 +29,14 @@ class BaytScraper(Scraper):
     ):
         super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
         self.scraper_input = None
+        self.session = None
         self.country = "worldwide"
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         self.scraper_input = scraper_input
+        self.session = create_session(
+            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
+        )
         job_list: list[JobPost] = []
         page = 1
         results_wanted = (
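Aside (not in the commit): the Bayt scraper now routes its requests through create_session from ..utils instead of calling requests directly. A rough sketch of what a retry-capable session of that kind can look like, built only from the standard requests/urllib3 retry machinery; the helper name and the retry defaults below are illustrative, not JobSpy's implementation:

import requests
from requests.adapters import HTTPAdapter, Retry


def build_retry_session(proxies: dict | None = None, ca_cert: str | None = None) -> requests.Session:
    # Retry transient failures (connection resets, 429s, 5xx responses)
    # with backoff before surfacing an exception to the caller.
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    if proxies:
        session.proxies.update(proxies)
    if ca_cert:
        session.verify = ca_cert
    return session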
@@ -37,13 +44,15 @@ class BaytScraper(Scraper):
         )
 
         while len(job_list) < results_wanted:
-            logger.info(f"Fetching Bayt jobs page {page}")
+            log.info(f"Fetching Bayt jobs page {page}")
             job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
             if not job_elements:
                 break
 
             if job_elements:
-                logger.debug("First job element snippet:\n" + job_elements[0].prettify()[:500])
+                log.debug(
+                    "First job element snippet:\n" + job_elements[0].prettify()[:500]
+                )
 
             initial_count = len(job_list)
             for job in job_elements:
@@ -54,16 +63,16 @@ class BaytScraper(Scraper):
                         if len(job_list) >= results_wanted:
                             break
                     else:
-                        logger.debug(
+                        log.debug(
                             "Extraction returned None. Job snippet:\n"
                             + job.prettify()[:500]
                         )
                 except Exception as e:
-                    logger.error(f"Bayt: Error extracting job info: {str(e)}")
+                    log.error(f"Bayt: Error extracting job info: {str(e)}")
                     continue
 
             if len(job_list) == initial_count:
-                logger.info(f"No new jobs found on page {page}. Ending pagination.")
+                log.info(f"No new jobs found on page {page}. Ending pagination.")
                 break
 
             page += 1
@@ -72,45 +81,35 @@ class BaytScraper(Scraper):
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
 
-    def _fetch_jobs(self, query: str, page: int = 1) -> Optional[list]:
+    def _fetch_jobs(self, query: str, page: int) -> list | None:
         """
         Grabs the job results for the given query and page number.
         """
         try:
-            # Updated URL to include the "international" segment as per the original code.
             url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
-            logger.info(f"Constructed URL: {url}")
-            headers = {
-                "User-Agent": (
-                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                    "AppleWebKit/537.36 (KHTML, like Gecko) "
-                    "Chrome/115.0.0.0 Safari/537.36"
-                )
-            }
-            response = requests.get(url, headers=headers, timeout=10)
+            response = self.session.get(url)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")
-            # Use the attribute selector as in the original code.
             job_listings = soup.find_all("li", attrs={"data-js-job": ""})
-            logger.info(f"Found {len(job_listings)} job listing elements")
+            log.debug(f"Found {len(job_listings)} job listing elements")
            return job_listings
         except Exception as e:
-            logger.error(f"Bayt: Error fetching jobs - {str(e)}")
+            log.error(f"Bayt: Error fetching jobs - {str(e)}")
             return None
 
-    def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
+    def _extract_job_info(self, job: BeautifulSoup) -> JobPost | None:
         """
         Extracts the job information from a single job listing.
         """
         # Find the h2 element holding the title and link (no class filtering)
         job_general_information = job.find("h2")
         if not job_general_information:
-            return None
+            return
 
         job_title = job_general_information.get_text(strip=True)
         job_url = self._extract_job_url(job_general_information)
         if not job_url:
-            return None
+            return
 
         # Extract company name using the original approach:
         company_tag = job.find("div", class_="t-nowrap p10l")
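Aside (not in the commit): the listing selector above keeps working across both versions because find_all with attrs={"data-js-job": ""} matches any <li> carrying a bare data-js-job attribute, which html.parser reads as an empty string. A self-contained sketch with made-up HTML:

from bs4 import BeautifulSoup

html = """
<ul>
  <li data-js-job><h2><a href="/en/job/123">Backend Engineer</a></h2></li>
  <li>pagination widget, not a job row</li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")
rows = soup.find_all("li", attrs={"data-js-job": ""})
print(len(rows))  # 1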
@@ -129,31 +128,18 @@ class BaytScraper(Scraper):
             city=location,
             country=Country.from_string(self.country),
         )
 
         return JobPost(
             id=job_id,
             title=job_title,
             company_name=company_name,
-            company_url="",
             location=location_obj,
-            date_posted=None,
             job_url=job_url,
-            compensation=None,
-            job_type=None,
-            job_level=None,
-            company_industry=None,
-            description=None,
-            job_url_direct=None,
-            emails=[],
-            company_logo=None,
-            job_function=None,
         )
 
-    def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]:
+    def _extract_job_url(self, job_general_information: BeautifulSoup) -> str | None:
         """
         Pulls the job URL from the 'a' within the h2 element.
         """
         a_tag = job_general_information.find("a")
         if a_tag and a_tag.has_attr("href"):
             return self.base_url + a_tag["href"].strip()
-        return None
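Aside (not in the commit): dropping the explicit placeholder arguments from the JobPost(...) call assumes those fields are optional with defaults on the model (JobSpy's job models are pydantic based), so the slimmed call behaves the same as before. An illustrative stand-in model, not the project's JobPost:

from typing import Optional

from pydantic import BaseModel


class ExamplePost(BaseModel):
    # Stand-in for a job model: optional fields default to None and can be
    # omitted at construction time, which is what the slimmed call relies on.
    title: str
    company_name: Optional[str] = None
    description: Optional[str] = None


print(ExamplePost(title="Backend Engineer"))  # company_name/description stay None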
@@ -30,6 +30,7 @@ class GoogleJobsException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Google Jobs")
 
+
 class BaytException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Bayt")
@@ -32,7 +32,7 @@ from ...jobs import (
     DescriptionFormat,
 )
 
-logger = create_logger("Glassdoor")
+log = create_logger("Glassdoor")
 
 
 class GlassdoorScraper(Scraper):
@@ -64,7 +64,7 @@ class GlassdoorScraper(Scraper):
         self.base_url = self.scraper_input.country.get_glassdoor_url()
 
         self.session = create_session(
-            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True
+            proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True
         )
         token = self._get_csrf_token()
         headers["gd-csrf-token"] = token if token else fallback_token
@@ -74,7 +74,7 @@ class GlassdoorScraper(Scraper):
             scraper_input.location, scraper_input.is_remote
         )
         if location_type is None:
-            logger.error("Glassdoor: location not parsed")
+            log.error("Glassdoor: location not parsed")
             return JobResponse(jobs=[])
         job_list: list[JobPost] = []
         cursor = None
@@ -83,7 +83,7 @@ class GlassdoorScraper(Scraper):
         tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
         range_end = min(tot_pages, self.max_pages + 1)
         for page in range(range_start, range_end):
-            logger.info(f"search page: {page} / {range_end-1}")
+            log.info(f"search page: {page} / {range_end - 1}")
             try:
                 jobs, cursor = self._fetch_jobs_page(
                     scraper_input, location_id, location_type, page, cursor
@@ -93,7 +93,7 @@ class GlassdoorScraper(Scraper):
                     job_list = job_list[: scraper_input.results_wanted]
                     break
             except Exception as e:
-                logger.error(f"Glassdoor: {str(e)}")
+                log.error(f"Glassdoor: {str(e)}")
                 break
         return JobResponse(jobs=job_list)
 
@@ -129,7 +129,7 @@ class GlassdoorScraper(Scraper):
             ValueError,
             Exception,
         ) as e:
-            logger.error(f"Glassdoor: {str(e)}")
+            log.error(f"Glassdoor: {str(e)}")
             return jobs, None
 
         jobs_data = res_json["data"]["jobListings"]["jobListings"]
@@ -264,12 +264,12 @@ class GlassdoorScraper(Scraper):
         if res.status_code != 200:
             if res.status_code == 429:
                 err = f"429 Response - Blocked by Glassdoor for too many requests"
-                logger.error(err)
+                log.error(err)
                 return None, None
             else:
                 err = f"Glassdoor response status code {res.status_code}"
                 err += f" - {res.text}"
-                logger.error(f"Glassdoor response status code {res.status_code}")
+                log.error(f"Glassdoor response status code {res.status_code}")
                 return None, None
         items = res.json()
 
@@ -26,7 +26,7 @@ from ...jobs import (
     JobType,
 )
 
-logger = create_logger("Google")
+log = create_logger("Google")
 
 
 class GoogleJobsScraper(Scraper):
@@ -61,7 +61,7 @@ class GoogleJobsScraper(Scraper):
         )
         forward_cursor, job_list = self._get_initial_cursor_and_jobs()
         if forward_cursor is None:
-            logger.warning(
+            log.warning(
                 "initial cursor not found, try changing your query or there was at most 10 results"
             )
             return JobResponse(jobs=job_list)
@@ -72,16 +72,16 @@ class GoogleJobsScraper(Scraper):
             len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
             and forward_cursor
         ):
-            logger.info(
+            log.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
             try:
                 jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
             except Exception as e:
-                logger.error(f"failed to get jobs on page: {page}, {e}")
+                log.error(f"failed to get jobs on page: {page}, {e}")
                 break
             if not jobs:
-                logger.info(f"found no jobs on page: {page}")
+                log.info(f"found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1
@@ -230,10 +230,7 @@ class GoogleJobsScraper(Scraper):
 
     @staticmethod
     def _find_job_info_initial_page(html_text: str):
-        pattern = (
-            f'520084652":('
-            + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
-        )
+        pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
         results = []
         matches = re.finditer(pattern, html_text)
 
@@ -245,6 +242,6 @@ class GoogleJobsScraper(Scraper):
                 results.append(parsed_data)
 
             except json.JSONDecodeError as e:
-                logger.error(f"Failed to parse match: {str(e)}")
+                log.error(f"Failed to parse match: {str(e)}")
                 results.append({"raw_match": match.group(0), "error": str(e)})
         return results
@@ -30,7 +30,7 @@ from ...jobs import (
     DescriptionFormat,
 )
 
-logger = create_logger("Indeed")
+log = create_logger("Indeed")
 
 
 class IndeedScraper(Scraper):
@@ -71,12 +71,12 @@ class IndeedScraper(Scraper):
         cursor = None
 
         while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
-            logger.info(
+            log.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
             jobs, cursor = self._scrape_page(cursor)
             if not jobs:
-                logger.info(f"found no jobs on page: {page}")
+                log.info(f"found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1
@@ -122,9 +122,10 @@ class IndeedScraper(Scraper):
             headers=api_headers_temp,
             json=payload,
             timeout=10,
+            verify=False,
         )
         if not response.ok:
-            logger.info(
+            log.info(
                 f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
             )
             return jobs, new_cursor
@@ -38,7 +38,7 @@ from ..utils import (
     markdown_converter,
 )
 
-logger = create_logger("LinkedIn")
+log = create_logger("LinkedIn")
 
 
 class LinkedInScraper(Scraper):
@@ -86,7 +86,7 @@ class LinkedInScraper(Scraper):
         )
         while continue_search():
             request_count += 1
-            logger.info(
+            log.info(
                 f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
             )
             params = {
@@ -126,13 +126,13 @@ class LinkedInScraper(Scraper):
                 else:
                     err = f"LinkedIn response status code {response.status_code}"
                     err += f" - {response.text}"
-                    logger.error(err)
+                    log.error(err)
                     return JobResponse(jobs=job_list)
             except Exception as e:
                 if "Proxy responded with" in str(e):
-                    logger.error(f"LinkedIn: Bad proxy")
+                    log.error(f"LinkedIn: Bad proxy")
                 else:
-                    logger.error(f"LinkedIn: {str(e)}")
+                    log.error(f"LinkedIn: {str(e)}")
                 return JobResponse(jobs=job_list)
 
             soup = BeautifulSoup(response.text, "html.parser")
@@ -1,17 +1,20 @@
 from __future__ import annotations
 
-import re
 import logging
+import re
 from itertools import cycle
 
+import numpy as np
 import requests
 import tls_client
-import numpy as np
+import urllib3
 from markdownify import markdownify as md
 from requests.adapters import HTTPAdapter, Retry
 
 from ..jobs import CompensationInterval, JobType
 
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
 
 def create_logger(name: str):
     logger = logging.getLogger(f"JobSpy:{name}")
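Aside (not in the commit): the new urllib3.disable_warnings(...) line pairs with the verify=False added to the Indeed request earlier in this diff; without it, every unverified HTTPS request prints an InsecureRequestWarning. A minimal standalone illustration, with a placeholder URL:

import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# With certificate verification disabled, this call would normally emit an
# InsecureRequestWarning on stderr; the line above silences it process-wide.
response = requests.get("https://example.com", verify=False, timeout=10)
print(response.status_code)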
@@ -129,7 +132,7 @@ def create_session(
     return session
 
 
-def set_logger_level(verbose: int = 2):
+def set_logger_level(verbose: int):
     """
     Adjusts the logger's level. This function allows the logging level to be changed at runtime.
 
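Aside: set_logger_level's new signature makes verbose a required argument. Its body is not shown in this hunk; a plausible sketch of the runtime-adjustable pattern its docstring describes, where the verbose-to-level mapping below is an assumption and not the project's code:

import logging


def example_set_logger_level(verbose: int) -> None:
    # Hypothetical mapping: 0 -> errors only, 1 -> warnings, 2 or more -> info.
    level = {0: logging.ERROR, 1: logging.WARNING}.get(verbose, logging.INFO)
    for name in logging.Logger.manager.loggerDict:
        # Loggers created via create_logger() are named "JobSpy:<Site>".
        if name.startswith("JobSpy:"):
            logging.getLogger(name).setLevel(level)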
@@ -11,11 +11,10 @@ import json
 import math
 import re
 import time
+from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from typing import Optional, Tuple, Any
 
-from concurrent.futures import ThreadPoolExecutor
-
 from bs4 import BeautifulSoup
 
 from .constants import headers
@@ -37,7 +36,7 @@ from ...jobs import (
     DescriptionFormat,
 )
 
-logger = create_logger("ZipRecruiter")
+log = create_logger("ZipRecruiter")
 
 
 class ZipRecruiterScraper(Scraper):
@@ -77,7 +76,7 @@ class ZipRecruiterScraper(Scraper):
                 break
             if page > 1:
                 time.sleep(self.delay)
-            logger.info(f"search page: {page} / {max_pages}")
+            log.info(f"search page: {page} / {max_pages}")
             jobs_on_page, continue_token = self._find_jobs_in_page(
                 scraper_input, continue_token
             )
@@ -110,13 +109,13 @@ class ZipRecruiterScraper(Scraper):
             else:
                 err = f"ZipRecruiter response status code {res.status_code}"
                 err += f" with response: {res.text}"  # ZipRecruiter likely not available in EU
-                logger.error(err)
+                log.error(err)
                 return jobs_list, ""
         except Exception as e:
             if "Proxy responded with" in str(e):
-                logger.error(f"Indeed: Bad proxy")
+                log.error(f"Indeed: Bad proxy")
             else:
-                logger.error(f"Indeed: {str(e)}")
+                log.error(f"Indeed: {str(e)}")
             return jobs_list, ""
 
         res_data = res.json()
@@ -215,7 +214,28 @@ class ZipRecruiterScraper(Scraper):
         return description_full, job_url_direct
 
     def _get_cookies(self):
-        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
+        """
+        Sends a session event to the API with device properties.
+        """
+        data = [
+            ("event_type", "session"),
+            ("logged_in", "false"),
+            ("number_of_retry", "1"),
+            ("property", "model:iPhone"),
+            ("property", "os:iOS"),
+            ("property", "locale:en_us"),
+            ("property", "app_build_number:4734"),
+            ("property", "app_version:91.0"),
+            ("property", "manufacturer:Apple"),
+            ("property", "timestamp:2025-01-12T12:04:42-06:00"),
+            ("property", "screen_height:852"),
+            ("property", "os_version:16.6.1"),
+            ("property", "source:install"),
+            ("property", "screen_width:393"),
+            ("property", "device_model:iPhone 14 Pro"),
+            ("property", "brand:Apple"),
+        ]
+
         url = f"{self.api_url}/jobs-app/event"
         self.session.post(url, data=data)
 
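Aside (not in the commit): the rewritten payload works because requests encodes a list of two-item tuples as a form body with repeated keys, something a plain dict cannot express since every "property" entry would collapse to one. A quick illustration with the standard-library encoder:

from urllib.parse import urlencode

data = [
    ("event_type", "session"),
    ("property", "model:iPhone"),
    ("property", "os:iOS"),
]
# Repeated keys survive; a dict would keep only the last "property" value.
print(urlencode(data))
# event_type=session&property=model%3AiPhone&property=os%3AiOS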
@@ -1,18 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_all():
-    sites = [
-        "indeed",
-        "glassdoor",
-    ]  # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci
-    result = scrape_jobs(
-        site_name=sites,
-        search_term="engineer",
-        results_wanted=5,
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == len(sites) * 5
-    ), "Result should be a non-empty DataFrame"
@@ -1,13 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_glassdoor():
-    result = scrape_jobs(
-        site_name="glassdoor",
-        search_term="engineer",
-        results_wanted=5,
-    )
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"
@@ -1,12 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_google():
-    result = scrape_jobs(
-        site_name="google", search_term="software engineer", results_wanted=5
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"
@@ -1,13 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_indeed():
-    result = scrape_jobs(
-        site_name="indeed",
-        search_term="engineer",
-        results_wanted=5,
-    )
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"
@@ -1,9 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_linkedin():
-    result = scrape_jobs(site_name="linkedin", search_term="engineer", results_wanted=5)
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"
@@ -1,12 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_ziprecruiter():
-    result = scrape_jobs(
-        site_name="zip_recruiter", search_term="software engineer", results_wanted=5
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"
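The six test modules above are deleted along with the workflow that ran them. Purely as a sketch (not part of this commit), the same coverage could be expressed as a single parametrized pytest if the suite is ever reinstated; the test name below is made up:

import pandas as pd
import pytest

from jobspy import scrape_jobs


@pytest.mark.parametrize(
    "site", ["indeed", "glassdoor", "google", "linkedin", "zip_recruiter"]
)
def test_site_returns_dataframe(site):
    result = scrape_jobs(site_name=site, search_term="engineer", results_wanted=5)
    assert (
        isinstance(result, pd.DataFrame) and len(result) == 5
    ), "Result should be a non-empty DataFrame"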