add offset param & email extraction (#51)

* add offset param

* [enh]: extract emails
This commit is contained in:
Cullen Watson
2023-09-28 18:11:28 -05:00
committed by GitHub
parent 286b9e1256
commit af07c1ecbd
17 changed files with 1209 additions and 1126 deletions

View File

@@ -8,7 +8,6 @@ import re
import math
import io
import json
import traceback
from datetime import datetime
from typing import Optional
@@ -27,7 +26,8 @@ from ...jobs import (
JobResponse,
JobType,
)
from .. import Scraper, ScraperInput, Site, Country
from .. import Scraper, ScraperInput, Site
from ...utils import extract_emails_from_text
class IndeedScraper(Scraper):
@@ -35,6 +35,8 @@ class IndeedScraper(Scraper):
"""
Initializes IndeedScraper with the Indeed job search url
"""
self.url = None
self.country = None
site = Site(Site.INDEED)
super().__init__(site, proxy=proxy)
@@ -42,7 +44,7 @@ class IndeedScraper(Scraper):
self.seen_urls = set()
def scrape_page(
self, scraper_input: ScraperInput, page: int, session: tls_client.Session
self, scraper_input: ScraperInput, page: int, session: tls_client.Session
) -> tuple[list[JobPost], int]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
@@ -61,7 +63,7 @@ class IndeedScraper(Scraper):
"q": scraper_input.search_term,
"l": scraper_input.location,
"filter": 0,
"start": 0 + page * 10,
"start": scraper_input.offset + page * 10,
}
if scraper_input.distance:
params["radius"] = scraper_input.distance
@@ -76,7 +78,7 @@ class IndeedScraper(Scraper):
params["sc"] = "0kf:" + "".join(sc_values) + ";"
try:
response = session.get(
self.url + "/jobs",
f"{self.url}/jobs",
params=params,
allow_redirects=True,
proxy=self.proxy,
@@ -101,9 +103,9 @@ class IndeedScraper(Scraper):
total_num_jobs = IndeedScraper.total_jobs(soup)
if (
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
raise IndeedException("No jobs found.")
@@ -137,9 +139,10 @@ class IndeedScraper(Scraper):
date_posted = date_posted.strftime("%Y-%m-%d")
description = self.get_description(job_url, session)
emails = extract_emails_from_text(description)
with io.StringIO(job["snippet"]) as f:
soup = BeautifulSoup(f, "html.parser")
li_elements = soup.find_all("li")
soup_io = BeautifulSoup(f, "html.parser")
li_elements = soup_io.find_all("li")
if description is None and li_elements:
description = " ".join(li.text for li in li_elements)
@@ -152,6 +155,7 @@ class IndeedScraper(Scraper):
state=job.get("jobLocationState"),
country=self.country,
),
emails=extract_emails_from_text(description),
job_type=job_type,
compensation=compensation,
date_posted=date_posted,
@@ -180,7 +184,7 @@ class IndeedScraper(Scraper):
)
pages_to_process = (
math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
)
#: get first page to initialize session
@@ -206,7 +210,7 @@ class IndeedScraper(Scraper):
)
return job_response
def get_description(self, job_page_url: str, session: tls_client.Session) -> str:
def get_description(self, job_page_url: str, session: tls_client.Session) -> Optional[str]:
"""
Retrieves job description by going to the job page url
:param job_page_url:
@@ -249,13 +253,17 @@ class IndeedScraper(Scraper):
label = taxonomy["attributes"][0].get("label")
if label:
job_type_str = label.replace("-", "").replace(" ", "").lower()
return IndeedScraper.get_enum_from_value(job_type_str)
return IndeedScraper.get_enum_from_job_type(job_type_str)
return None
@staticmethod
def get_enum_from_value(value_str):
def get_enum_from_job_type(job_type_str):
"""
Given a string, returns the corresponding JobType enum member if a match is found.
for job_type in JobType:
if value_str in job_type.value:
"""
for job_type in JobType:
if job_type_str in job_type.value:
return job_type
return None
@@ -276,9 +284,9 @@ class IndeedScraper(Scraper):
for tag in script_tags:
if (
tag.string
and "mosaic.providerData" in tag.string
and "mosaic-provider-jobcards" in tag.string
tag.string
and "mosaic.providerData" in tag.string
and "mosaic-provider-jobcards" in tag.string
):
return tag
return None