mirror of https://github.com/Bunsly/JobSpy
fix(indeed): fetch full description
parent
eb728a572a
commit
b4b836ff71
|
@ -26,8 +26,9 @@ class ScraperInput(BaseModel):
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
def __init__(self, site: Site):
|
def __init__(self, site: Site, url: str):
|
||||||
self.site = site
|
self.site = site
|
||||||
|
self.url = url
|
||||||
|
|
||||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||||
...
|
...
|
||||||
|
|
|
@ -3,6 +3,7 @@ import json
|
||||||
from typing import Optional, Tuple, List
|
from typing import Optional, Tuple, List
|
||||||
|
|
||||||
import tls_client
|
import tls_client
|
||||||
|
import urllib.parse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
from fastapi import status
|
from fastapi import status
|
||||||
|
@ -25,9 +26,8 @@ class IndeedScraper(Scraper):
|
||||||
Initializes IndeedScraper with the Indeed job search url
|
Initializes IndeedScraper with the Indeed job search url
|
||||||
"""
|
"""
|
||||||
site = Site(Site.INDEED)
|
site = Site(Site.INDEED)
|
||||||
super().__init__(site)
|
url = "https://www.indeed.com"
|
||||||
self.url = "https://www.indeed.com/jobs"
|
super().__init__(site, url)
|
||||||
self.job_url = "https://www.indeed.com/viewjob?jk="
|
|
||||||
|
|
||||||
self.jobs_per_page = 15
|
self.jobs_per_page = 15
|
||||||
self.seen_urls = set()
|
self.seen_urls = set()
|
||||||
|
@ -60,7 +60,7 @@ class IndeedScraper(Scraper):
|
||||||
|
|
||||||
if sc_values:
|
if sc_values:
|
||||||
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
||||||
response = session.get(self.url, params=params)
|
response = session.get(self.url + "/jobs", params=params)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
response.status_code != status.HTTP_200_OK
|
response.status_code != status.HTTP_200_OK
|
||||||
|
@ -82,10 +82,10 @@ class IndeedScraper(Scraper):
|
||||||
):
|
):
|
||||||
raise Exception("No jobs found.")
|
raise Exception("No jobs found.")
|
||||||
|
|
||||||
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
|
def process_job(job) -> Optional[JobPost]:
|
||||||
job_url = f'{self.job_url}{job["jobkey"]}'
|
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
continue
|
return None
|
||||||
|
|
||||||
snippet_html = BeautifulSoup(job["snippet"], "html.parser")
|
snippet_html = BeautifulSoup(job["snippet"], "html.parser")
|
||||||
|
|
||||||
|
@ -110,11 +110,8 @@ class IndeedScraper(Scraper):
|
||||||
job_type = IndeedScraper.get_job_type(job)
|
job_type = IndeedScraper.get_job_type(job)
|
||||||
timestamp_seconds = job["pubDate"] / 1000
|
timestamp_seconds = job["pubDate"] / 1000
|
||||||
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
||||||
li_elements = snippet_html.find_all("li")
|
description = self.get_description(job_url, session)
|
||||||
if li_elements:
|
|
||||||
description = " ".join(li.text for li in li_elements)
|
|
||||||
else:
|
|
||||||
description = None
|
|
||||||
first_li = snippet_html.find("li")
|
first_li = snippet_html.find("li")
|
||||||
job_post = JobPost(
|
job_post = JobPost(
|
||||||
title=job["normTitle"],
|
title=job["normTitle"],
|
||||||
|
@ -131,6 +128,10 @@ class IndeedScraper(Scraper):
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
)
|
)
|
||||||
|
return job_post
|
||||||
|
|
||||||
|
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
|
||||||
|
job_post = process_job(job)
|
||||||
job_list.append(job_post)
|
job_list.append(job_post)
|
||||||
|
|
||||||
return job_list, total_num_jobs
|
return job_list, total_num_jobs
|
||||||
|
@ -190,6 +191,27 @@ class IndeedScraper(Scraper):
|
||||||
)
|
)
|
||||||
return job_response
|
return job_response
|
||||||
|
|
||||||
|
def get_description(self, job_page_url: str, session: tls_client.Session) -> str:
|
||||||
|
"""
|
||||||
|
Retrieves job description by going to the job page url
|
||||||
|
:param job_page_url:
|
||||||
|
:param session:
|
||||||
|
:return: description
|
||||||
|
"""
|
||||||
|
parsed_url = urllib.parse.urlparse(job_page_url)
|
||||||
|
params = urllib.parse.parse_qs(parsed_url.query)
|
||||||
|
jk_value = params.get("jk", [None])[0]
|
||||||
|
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
|
||||||
|
|
||||||
|
response = session.get(formatted_url, allow_redirects=True)
|
||||||
|
|
||||||
|
raw_description = response.json()["body"]["jobInfoWrapperModel"][
|
||||||
|
"jobInfoModel"
|
||||||
|
]["sanitizedJobDescription"]
|
||||||
|
soup = BeautifulSoup(raw_description, "html.parser")
|
||||||
|
text_content = " ".join(soup.get_text().split()).strip()
|
||||||
|
return text_content
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_job_type(job: dict) -> Optional[JobType]:
|
def get_job_type(job: dict) -> Optional[JobType]:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -15,10 +15,8 @@ class LinkedInScraper(Scraper):
|
||||||
Initializes LinkedInScraper with the LinkedIn job search url
|
Initializes LinkedInScraper with the LinkedIn job search url
|
||||||
"""
|
"""
|
||||||
site = Site(Site.LINKEDIN)
|
site = Site(Site.LINKEDIN)
|
||||||
super().__init__(site)
|
url = "https://www.linkedin.com"
|
||||||
|
super().__init__(site, url)
|
||||||
self.url = "https://www.linkedin.com/jobs/search/"
|
|
||||||
self.job_url = "https://www.linkedin.com/jobs/view/"
|
|
||||||
|
|
||||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||||
"""
|
"""
|
||||||
|
@ -57,7 +55,9 @@ class LinkedInScraper(Scraper):
|
||||||
|
|
||||||
params = {k: v for k, v in params.items() if v is not None}
|
params = {k: v for k, v in params.items() if v is not None}
|
||||||
print(params)
|
print(params)
|
||||||
response = session.get(self.url, params=params, allow_redirects=True)
|
response = session.get(
|
||||||
|
f"{self.url}/jobs/search", params=params, allow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
if response.status_code != status.HTTP_200_OK:
|
if response.status_code != status.HTTP_200_OK:
|
||||||
return JobResponse(
|
return JobResponse(
|
||||||
|
@ -82,7 +82,7 @@ class LinkedInScraper(Scraper):
|
||||||
job_id = (
|
job_id = (
|
||||||
data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
|
data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
|
||||||
)
|
)
|
||||||
job_url = f"{self.job_url}{job_id}"
|
job_url = f"{self.url}/jobs/view/{job_id}"
|
||||||
if job_url in seen_urls:
|
if job_url in seen_urls:
|
||||||
continue
|
continue
|
||||||
seen_urls.add(job_url)
|
seen_urls.add(job_url)
|
||||||
|
|
|
@ -5,6 +5,7 @@ from urllib.parse import urlparse, parse_qs
|
||||||
import tls_client
|
import tls_client
|
||||||
from fastapi import status
|
from fastapi import status
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Tag
|
||||||
from concurrent.futures import ThreadPoolExecutor, Future
|
from concurrent.futures import ThreadPoolExecutor, Future
|
||||||
|
|
||||||
from api.core.jobs import JobPost
|
from api.core.jobs import JobPost
|
||||||
|
@ -14,14 +15,13 @@ import math
|
||||||
|
|
||||||
|
|
||||||
class ZipRecruiterScraper(Scraper):
|
class ZipRecruiterScraper(Scraper):
|
||||||
url = "https://www.ziprecruiter.com"
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""
|
"""
|
||||||
Initializes LinkedInScraper with the ZipRecruiter job search url
|
Initializes LinkedInScraper with the ZipRecruiter job search url
|
||||||
"""
|
"""
|
||||||
site = Site(Site.ZIP_RECRUITER)
|
site = Site(Site.ZIP_RECRUITER)
|
||||||
super().__init__(site)
|
url = "https://www.ziprecruiter.com"
|
||||||
|
super().__init__(site, url)
|
||||||
|
|
||||||
self.jobs_per_page = 20
|
self.jobs_per_page = 20
|
||||||
self.seen_urls = set()
|
self.seen_urls = set()
|
||||||
|
@ -84,11 +84,11 @@ class ZipRecruiterScraper(Scraper):
|
||||||
job_posts = soup.find_all("div", {"class": "job_content"})
|
job_posts = soup.find_all("div", {"class": "job_content"})
|
||||||
|
|
||||||
def process_job(job: Tag) -> Optional[JobPost]:
|
def process_job(job: Tag) -> Optional[JobPost]:
|
||||||
'''
|
"""
|
||||||
Parses a job from the job content tag
|
Parses a job from the job content tag
|
||||||
:param job: BeautifulSoup Tag for one job post
|
:param job: BeautifulSoup Tag for one job post
|
||||||
:return JobPost
|
:return JobPost
|
||||||
'''
|
"""
|
||||||
job_url = job.find("a", {"class": "job_link"})["href"]
|
job_url = job.find("a", {"class": "job_link"})["href"]
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
return None
|
return None
|
||||||
|
@ -201,7 +201,7 @@ class ZipRecruiterScraper(Scraper):
|
||||||
|
|
||||||
job_description_div = soup_job.find("div", {"class": "job_description"})
|
job_description_div = soup_job.find("div", {"class": "job_description"})
|
||||||
if job_description_div:
|
if job_description_div:
|
||||||
return job_description_div.text.strip("\n"), response.url
|
return job_description_div.text.strip(), response.url
|
||||||
return None, response.url
|
return None, response.url
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
Loading…
Reference in New Issue