fix job type search (#106)

This commit is contained in:
Cullen Watson
2024-02-12 11:02:48 -06:00
committed by GitHub
parent 91b137ef86
commit aeb1a50d2c
7 changed files with 220 additions and 137 deletions

View File

@@ -11,7 +11,6 @@ import requests
from typing import Any
from datetime import datetime
import urllib.parse
from bs4 import BeautifulSoup
from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
@@ -22,7 +21,7 @@ from ..utils import (
extract_emails_from_text,
create_session,
get_enum_from_job_type,
modify_and_get_description
logger
)
from ...jobs import (
JobPost,
@@ -50,13 +49,14 @@ class IndeedScraper(Scraper):
def scrape_page(
self, scraper_input: ScraperInput, page: int
) -> tuple[list[JobPost], int]:
) -> list[JobPost]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
:param scraper_input:
:param page:
:return: jobs found on page, total number of jobs found for search
"""
job_list = []
self.country = scraper_input.country
domain = self.country.indeed_domain_value
self.url = f"https://{domain}.indeed.com"
@@ -76,14 +76,14 @@ class IndeedScraper(Scraper):
)
except Exception as e:
if "Proxy responded with" in str(e):
raise IndeedException("bad proxy")
raise IndeedException(str(e))
logger.error(f'Indeed: Bad proxy')
else:
logger.error(f'Indeed: {str(e)}')
return job_list
soup = BeautifulSoup(response.content, "html.parser")
job_list = []
total_num_jobs = IndeedScraper.total_jobs(soup)
if "did not match any jobs" in response.text:
return job_list, total_num_jobs
return job_list
jobs = IndeedScraper.parse_jobs(
soup
@@ -145,7 +145,7 @@ class IndeedScraper(Scraper):
job_list = [result.result() for result in job_results if result.result()]
return job_list, total_num_jobs
return job_list
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@@ -153,7 +153,7 @@ class IndeedScraper(Scraper):
:param scraper_input:
:return: job_response
"""
job_list, total_results = self.scrape_page(scraper_input, 0)
job_list = self.scrape_page(scraper_input, 0)
pages_processed = 1
while len(self.seen_urls) < scraper_input.results_wanted:
@@ -167,7 +167,7 @@ class IndeedScraper(Scraper):
]
for future in futures:
jobs, _ = future.result()
jobs = future.result()
if jobs:
job_list += jobs
new_jobs = True
@@ -182,55 +182,7 @@ class IndeedScraper(Scraper):
if len(self.seen_urls) > scraper_input.results_wanted:
job_list = job_list[:scraper_input.results_wanted]
job_response = JobResponse(
jobs=job_list,
total_results=total_results,
)
return job_response
def get_description(self, job_page_url: str) -> str | None:
"""
Retrieves job description by going to the job page url
:param job_page_url:
:return: description
"""
parsed_url = urllib.parse.urlparse(job_page_url)
params = urllib.parse.parse_qs(parsed_url.query)
jk_value = params.get("jk", [None])[0]
formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
session = create_session(self.proxy)
try:
response = session.get(
formatted_url,
headers=self.get_headers(),
allow_redirects=True,
timeout_seconds=5,
)
except Exception as e:
return None
if response.status_code not in range(200, 400):
return None
try:
soup = BeautifulSoup(response.text, 'html.parser')
script_tags = soup.find_all('script')
job_description = ''
for tag in script_tags:
if 'window._initialData' in tag.text:
json_str = tag.text
json_str = json_str.split('window._initialData=')[1]
json_str = json_str.rsplit(';', 1)[0]
data = json.loads(json_str)
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
break
except (KeyError, TypeError, IndexError):
return None
soup = BeautifulSoup(job_description, "html.parser")
return modify_and_get_description(soup)
return JobResponse(jobs=job_list)
@staticmethod
def get_job_type(job: dict) -> list[JobType] | None:
@@ -330,24 +282,6 @@ class IndeedScraper(Scraper):
"Could not find any results for the search"
)
@staticmethod
def total_jobs(soup: BeautifulSoup) -> int:
"""
Parses the total jobs for that search from soup object
:param soup:
:return: total_num_jobs
"""
script = soup.find("script", string=lambda t: t and "window._initialData" in t)
pattern = re.compile(r"window._initialData\s*=\s*({.*})\s*;", re.DOTALL)
match = pattern.search(script.string)
total_num_jobs = 0
if match:
json_str = match.group(1)
data = json.loads(json_str)
total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"])
return total_num_jobs
@staticmethod
def get_headers():
return {
@@ -380,7 +314,7 @@ class IndeedScraper(Scraper):
if scraper_input.is_remote:
sc_values.append("attr(DSQF7)")
if scraper_input.job_type:
sc_values.append("jt({})".format(scraper_input.job_type.value))
sc_values.append("jt({})".format(scraper_input.job_type.value[0]))
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
@@ -406,7 +340,7 @@ class IndeedScraper(Scraper):
taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
for taxonomy in job.get("taxonomyAttributes", [])
)
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy
def get_job_details(self, job_keys: list[str]) -> dict:
"""