mirror of https://github.com/Bunsly/JobSpy
fix(indeed): use new session obj per request
parent
a4d08ffb77
commit
75e3f73ba3
|
@ -123,6 +123,7 @@ def scrape_jobs(
|
||||||
job_data["job_type"] = ", ".join(job_type.value[0] for job_type in job_data["job_type"]) if job_data[
|
job_data["job_type"] = ", ".join(job_type.value[0] for job_type in job_data["job_type"]) if job_data[
|
||||||
"job_type"] else None
|
"job_type"] else None
|
||||||
|
|
||||||
|
|
||||||
job_data["location"] = Location(**job_data["location"]).display_location()
|
job_data["location"] = Location(**job_data["location"]).display_location()
|
||||||
|
|
||||||
compensation_obj = job_data.get("compensation")
|
compensation_obj = job_data.get("compensation")
|
||||||
|
|
|
@ -9,7 +9,6 @@ import math
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import tls_client
|
import tls_client
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
@ -31,7 +30,7 @@ from .. import Scraper, ScraperInput, Site
|
||||||
|
|
||||||
|
|
||||||
class IndeedScraper(Scraper):
|
class IndeedScraper(Scraper):
|
||||||
def __init__(self, proxy: Optional[str] = None):
|
def __init__(self, proxy: str | None = None):
|
||||||
"""
|
"""
|
||||||
Initializes IndeedScraper with the Indeed job search url
|
Initializes IndeedScraper with the Indeed job search url
|
||||||
"""
|
"""
|
||||||
|
@ -44,20 +43,18 @@ class IndeedScraper(Scraper):
|
||||||
self.seen_urls = set()
|
self.seen_urls = set()
|
||||||
|
|
||||||
def scrape_page(
|
def scrape_page(
|
||||||
self, scraper_input: ScraperInput, page: int, session: tls_client.Session
|
self, scraper_input: ScraperInput, page: int
|
||||||
) -> tuple[list[JobPost], int]:
|
) -> tuple[list[JobPost], int]:
|
||||||
"""
|
"""
|
||||||
Scrapes a page of Indeed for jobs with scraper_input criteria
|
Scrapes a page of Indeed for jobs with scraper_input criteria
|
||||||
:param scraper_input:
|
:param scraper_input:
|
||||||
:param page:
|
:param page:
|
||||||
:param session:
|
|
||||||
:return: jobs found on page, total number of jobs found for search
|
:return: jobs found on page, total number of jobs found for search
|
||||||
"""
|
"""
|
||||||
self.country = scraper_input.country
|
self.country = scraper_input.country
|
||||||
domain = self.country.domain_value
|
domain = self.country.domain_value
|
||||||
self.url = f"https://{domain}.indeed.com"
|
self.url = f"https://{domain}.indeed.com"
|
||||||
|
session = self.create_session()
|
||||||
job_list: list[JobPost] = []
|
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
"q": scraper_input.search_term,
|
"q": scraper_input.search_term,
|
||||||
|
@ -79,9 +76,9 @@ class IndeedScraper(Scraper):
|
||||||
try:
|
try:
|
||||||
response = session.get(
|
response = session.get(
|
||||||
f"{self.url}/jobs",
|
f"{self.url}/jobs",
|
||||||
|
headers=self.get_headers(),
|
||||||
params=params,
|
params=params,
|
||||||
allow_redirects=True,
|
allow_redirects=True,
|
||||||
proxy=self.proxy,
|
|
||||||
timeout_seconds=10,
|
timeout_seconds=10,
|
||||||
)
|
)
|
||||||
if response.status_code not in range(200, 400):
|
if response.status_code not in range(200, 400):
|
||||||
|
@ -109,7 +106,7 @@ class IndeedScraper(Scraper):
|
||||||
):
|
):
|
||||||
raise IndeedException("No jobs found.")
|
raise IndeedException("No jobs found.")
|
||||||
|
|
||||||
def process_job(job) -> Optional[JobPost]:
|
def process_job(job) -> JobPost | None:
|
||||||
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
|
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
|
||||||
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
|
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
|
@ -138,7 +135,7 @@ class IndeedScraper(Scraper):
|
||||||
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
||||||
date_posted = date_posted.strftime("%Y-%m-%d")
|
date_posted = date_posted.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
description = self.get_description(job_url, session)
|
description = self.get_description(job_url)
|
||||||
with io.StringIO(job["snippet"]) as f:
|
with io.StringIO(job["snippet"]) as f:
|
||||||
soup_io = BeautifulSoup(f, "html.parser")
|
soup_io = BeautifulSoup(f, "html.parser")
|
||||||
li_elements = soup_io.find_all("li")
|
li_elements = soup_io.find_all("li")
|
||||||
|
@ -179,20 +176,16 @@ class IndeedScraper(Scraper):
|
||||||
:param scraper_input:
|
:param scraper_input:
|
||||||
:return: job_response
|
:return: job_response
|
||||||
"""
|
"""
|
||||||
session = tls_client.Session(
|
|
||||||
client_identifier="chrome112", random_tls_extension_order=True
|
|
||||||
)
|
|
||||||
|
|
||||||
pages_to_process = (
|
pages_to_process = (
|
||||||
math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
|
math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
|
||||||
)
|
)
|
||||||
|
|
||||||
#: get first page to initialize session
|
#: get first page to initialize session
|
||||||
job_list, total_results = self.scrape_page(scraper_input, 0, session)
|
job_list, total_results = self.scrape_page(scraper_input, 0)
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
with ThreadPoolExecutor(max_workers=1) as executor:
|
||||||
futures: list[Future] = [
|
futures: list[Future] = [
|
||||||
executor.submit(self.scrape_page, scraper_input, page, session)
|
executor.submit(self.scrape_page, scraper_input, page)
|
||||||
for page in range(1, pages_to_process + 1)
|
for page in range(1, pages_to_process + 1)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -210,21 +203,24 @@ class IndeedScraper(Scraper):
|
||||||
)
|
)
|
||||||
return job_response
|
return job_response
|
||||||
|
|
||||||
def get_description(self, job_page_url: str, session: tls_client.Session) -> Optional[str]:
|
def get_description(self, job_page_url: str) -> str | None:
|
||||||
"""
|
"""
|
||||||
Retrieves job description by going to the job page url
|
Retrieves job description by going to the job page url
|
||||||
:param job_page_url:
|
:param job_page_url:
|
||||||
:param session:
|
|
||||||
:return: description
|
:return: description
|
||||||
"""
|
"""
|
||||||
parsed_url = urllib.parse.urlparse(job_page_url)
|
parsed_url = urllib.parse.urlparse(job_page_url)
|
||||||
params = urllib.parse.parse_qs(parsed_url.query)
|
params = urllib.parse.parse_qs(parsed_url.query)
|
||||||
jk_value = params.get("jk", [None])[0]
|
jk_value = params.get("jk", [None])[0]
|
||||||
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
|
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
|
||||||
|
session = self.create_session()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = session.get(
|
response = session.get(
|
||||||
formatted_url, allow_redirects=True, timeout_seconds=5, proxy=self.proxy
|
formatted_url,
|
||||||
|
headers=self.get_headers(),
|
||||||
|
allow_redirects=True,
|
||||||
|
timeout_seconds=5,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return None
|
return None
|
||||||
|
@ -241,7 +237,7 @@ class IndeedScraper(Scraper):
|
||||||
return text_content
|
return text_content
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_job_type(job: dict) -> Optional[list[JobType]]:
|
def get_job_type(job: dict) -> list[JobType] | None:
|
||||||
"""
|
"""
|
||||||
Parses the job to get list of job types
|
Parses the job to get list of job types
|
||||||
:param job:
|
:param job:
|
||||||
|
@ -276,7 +272,7 @@ class IndeedScraper(Scraper):
|
||||||
:return: jobs
|
:return: jobs
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def find_mosaic_script() -> Optional[Tag]:
|
def find_mosaic_script() -> Tag | None:
|
||||||
"""
|
"""
|
||||||
Finds jobcards script tag
|
Finds jobcards script tag
|
||||||
:return: script_tag
|
:return: script_tag
|
||||||
|
@ -326,3 +322,39 @@ class IndeedScraper(Scraper):
|
||||||
data = json.loads(json_str)
|
data = json.loads(json_str)
|
||||||
total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"])
|
total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"])
|
||||||
return total_num_jobs
|
return total_num_jobs
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_headers():
|
||||||
|
return {
|
||||||
|
'authority': 'www.indeed.com',
|
||||||
|
'accept': '*/*',
|
||||||
|
'accept-language': 'en-US,en;q=0.9',
|
||||||
|
'referer': 'https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw',
|
||||||
|
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
|
||||||
|
'sec-ch-ua-mobile': '?0',
|
||||||
|
'sec-ch-ua-platform': '"Windows"',
|
||||||
|
'sec-fetch-dest': 'empty',
|
||||||
|
'sec-fetch-mode': 'cors',
|
||||||
|
'sec-fetch-site': 'same-origin',
|
||||||
|
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
|
||||||
|
}
|
||||||
|
|
||||||
|
def create_session(self):
|
||||||
|
"""
|
||||||
|
Creates a session with specific client identifiers and assigns proxies if available.
|
||||||
|
|
||||||
|
:return: A session object with or without proxies.
|
||||||
|
"""
|
||||||
|
session = tls_client.Session(
|
||||||
|
client_identifier="chrome112",
|
||||||
|
random_tls_extension_order=True,
|
||||||
|
)
|
||||||
|
session.proxies = self.proxy
|
||||||
|
# TODO multiple proxies
|
||||||
|
# if self.proxies:
|
||||||
|
# session.proxies = {
|
||||||
|
# "http": random.choice(self.proxies),
|
||||||
|
# "https": random.choice(self.proxies),
|
||||||
|
# }
|
||||||
|
|
||||||
|
return session
|
||||||
|
|
Loading…
Reference in New Issue