formatting

pull/38/head
Cullen Watson 2023-09-05 12:16:10 -05:00
parent ccc53854a1
commit 20801022a1
5 changed files with 127 additions and 99 deletions

View File

@@ -5,12 +5,7 @@ from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
 from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.linkedin import LinkedInScraper
-from .scrapers import (
-    ScraperInput,
-    Site,
-    JobResponse,
-    Country
-)
+from .scrapers import ScraperInput, Site, JobResponse, Country

 SCRAPER_MAPPING = {
@@ -33,7 +28,7 @@ def scrape_jobs(
     job_type: JobType = None,
     easy_apply: bool = False,  # linkedin
     results_wanted: int = 15,
-    country: str = 'usa'
+    country: str = "usa",
 ) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
@@ -76,14 +71,14 @@ def scrape_jobs(
     for job in job_response.jobs:
         data = job.dict()
         data["site"] = site
-        data['company'] = data['company_name']
+        data["company"] = data["company_name"]
         if data["job_type"]:
             # Take the first value from the job type tuple
             data["job_type"] = data["job_type"].value[0]
         else:
             data["job_type"] = None
-        data['location'] = Location(**data['location']).display_location()
+        data["location"] = Location(**data["location"]).display_location()
         compensation_obj = data.get("compensation")
         if compensation_obj and isinstance(compensation_obj, dict):
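Taken together, the hunk above flattens each JobPost into a single row before the rows are assembled into the DataFrame. A minimal sketch of that step, assuming data is job.dict() from a pydantic JobPost; the helper name and the import path are illustrative, the field handling mirrors the diff:

from jobspy.jobs import Location  # import path is an assumption

def to_row(data: dict, site: str) -> dict:
    # Sketch of the per-job normalization shown in the hunk above
    data["site"] = site
    data["company"] = data["company_name"]
    # JobType members hold a tuple of labels; keep only the canonical first one
    data["job_type"] = data["job_type"].value[0] if data["job_type"] else None
    data["location"] = Location(**data["location"]).display_location()
    return data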
@@ -110,7 +105,7 @@ def scrape_jobs(
         "site",
         "title",
         "company",
-        'location',
+        "location",
         "job_type",
         "interval",
         "min_amount",

View File

@@ -6,13 +6,41 @@ from pydantic import BaseModel, validator
 class JobType(Enum):
-    FULL_TIME = ("fulltime", "períodointegral", "estágio/trainee", "cunormăîntreagă", "tiempocompleto", "vollzeit", "voltijds", "tempointegral", "全职", 'plnýúvazek', 'fuldtid', 'دوامكامل',
-                 'kokopäivätyö', 'tempsplein', 'vollzeit', 'πλήρηςαπασχόληση', 'teljesmunkaidő', 'tempopieno', 'tempsplein', 'heltid', 'jornadacompleta', 'pełnyetat', '정규직', '100%', '全職',
-                 'งานประจำ', 'tamzamanlı', 'повназайнятість', 'toànthờigian')
+    FULL_TIME = (
+        "fulltime",
+        "períodointegral",
+        "estágio/trainee",
+        "cunormăîntreagă",
+        "tiempocompleto",
+        "vollzeit",
+        "voltijds",
+        "tempointegral",
+        "全职",
+        "plnýúvazek",
+        "fuldtid",
+        "دوامكامل",
+        "kokopäivätyö",
+        "tempsplein",
+        "vollzeit",
+        "πλήρηςαπασχόληση",
+        "teljesmunkaidő",
+        "tempopieno",
+        "tempsplein",
+        "heltid",
+        "jornadacompleta",
+        "pełnyetat",
+        "정규직",
+        "100%",
+        "全職",
+        "งานประจำ",
+        "tamzamanlı",
+        "повназайнятість",
+        "toànthờigian",
+    )
     PART_TIME = ("parttime", "teilzeit")
     CONTRACT = ("contract", "contractor")
     TEMPORARY = ("temporary",)
-    INTERNSHIP = ("internship", "prácticas", 'ojt(onthejobtraining)', 'praktikum')
+    INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum")
     PER_DIEM = ("perdiem",)
     NIGHTS = ("nights",)
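Each JobType member's value is a tuple of localized labels with the canonical English label first, which is why scrape_jobs stores job_type.value[0]. A small hedged sketch of using that layout; the import path and the lookup helper are assumptions, the tuple contents are from the diff:

from jobspy.jobs import JobType  # import path is an assumption

def job_type_from_label(label: str) -> JobType | None:
    # Compare a squashed, lower-cased label against every member's value tuple
    needle = label.replace(" ", "").lower()
    for job_type in JobType:
        if needle in job_type.value:
            return job_type
    return None

print(JobType.FULL_TIME.value[0])              # "fulltime", the canonical label
print(job_type_from_label("Tiempo Completo"))  # JobType.FULL_TIME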
@@ -22,74 +50,74 @@ class JobType(Enum):
 class Country(Enum):
-    ARGENTINA = ('argentina', 'ar')
+    ARGENTINA = ("argentina", "ar")
-    AUSTRALIA = ('australia', 'au')
+    AUSTRALIA = ("australia", "au")
-    AUSTRIA = ('austria', 'at')
+    AUSTRIA = ("austria", "at")
-    BAHRAIN = ('bahrain', 'bh')
+    BAHRAIN = ("bahrain", "bh")
-    BELGIUM = ('belgium', 'be')
+    BELGIUM = ("belgium", "be")
-    BRAZIL = ('brazil', 'br')
+    BRAZIL = ("brazil", "br")
-    CANADA = ('canada', 'ca')
+    CANADA = ("canada", "ca")
-    CHILE = ('chile', 'cl')
+    CHILE = ("chile", "cl")
-    CHINA = ('china', 'cn')
+    CHINA = ("china", "cn")
-    COLOMBIA = ('colombia', 'co')
+    COLOMBIA = ("colombia", "co")
-    COSTARICA = ('costa rica', 'cr')
+    COSTARICA = ("costa rica", "cr")
-    CZECHREPUBLIC = ('czech republic', 'cz')
+    CZECHREPUBLIC = ("czech republic", "cz")
-    DENMARK = ('denmark', 'dk')
+    DENMARK = ("denmark", "dk")
-    ECUADOR = ('ecuador', 'ec')
+    ECUADOR = ("ecuador", "ec")
-    EGYPT = ('egypt', 'eg')
+    EGYPT = ("egypt", "eg")
-    FINLAND = ('finland', 'fi')
+    FINLAND = ("finland", "fi")
-    FRANCE = ('france', 'fr')
+    FRANCE = ("france", "fr")
-    GERMANY = ('germany', 'de')
+    GERMANY = ("germany", "de")
-    GREECE = ('greece', 'gr')
+    GREECE = ("greece", "gr")
-    HONGKONG = ('hong kong', 'hk')
+    HONGKONG = ("hong kong", "hk")
-    HUNGARY = ('hungary', 'hu')
+    HUNGARY = ("hungary", "hu")
-    INDIA = ('india', 'in')
+    INDIA = ("india", "in")
-    INDONESIA = ('indonesia', 'id')
+    INDONESIA = ("indonesia", "id")
-    IRELAND = ('ireland', 'ie')
+    IRELAND = ("ireland", "ie")
-    ISRAEL = ('israel', 'il')
+    ISRAEL = ("israel", "il")
-    ITALY = ('italy', 'it')
+    ITALY = ("italy", "it")
-    JAPAN = ('japan', 'jp')
+    JAPAN = ("japan", "jp")
-    KUWAIT = ('kuwait', 'kw')
+    KUWAIT = ("kuwait", "kw")
-    LUXEMBOURG = ('luxembourg', 'lu')
+    LUXEMBOURG = ("luxembourg", "lu")
-    MALAYSIA = ('malaysia', 'malaysia')
+    MALAYSIA = ("malaysia", "malaysia")
-    MEXICO = ('mexico', 'mx')
+    MEXICO = ("mexico", "mx")
-    MOROCCO = ('morocco', 'ma')
+    MOROCCO = ("morocco", "ma")
-    NETHERLANDS = ('netherlands', 'nl')
+    NETHERLANDS = ("netherlands", "nl")
-    NEWZEALAND = ('new zealand', 'nz')
+    NEWZEALAND = ("new zealand", "nz")
-    NIGERIA = ('nigeria', 'ng')
+    NIGERIA = ("nigeria", "ng")
-    NORWAY = ('norway', 'no')
+    NORWAY = ("norway", "no")
-    OMAN = ('oman', 'om')
+    OMAN = ("oman", "om")
-    PAKISTAN = ('pakistan', 'pk')
+    PAKISTAN = ("pakistan", "pk")
-    PANAMA = ('panama', 'pa')
+    PANAMA = ("panama", "pa")
-    PERU = ('peru', 'pe')
+    PERU = ("peru", "pe")
-    PHILIPPINES = ('philippines', 'ph')
+    PHILIPPINES = ("philippines", "ph")
-    POLAND = ('poland', 'pl')
+    POLAND = ("poland", "pl")
-    PORTUGAL = ('portugal', 'pt')
+    PORTUGAL = ("portugal", "pt")
-    QATAR = ('qatar', 'qa')
+    QATAR = ("qatar", "qa")
-    ROMANIA = ('romania', 'ro')
+    ROMANIA = ("romania", "ro")
-    SAUDIARABIA = ('saudi arabia', 'sa')
+    SAUDIARABIA = ("saudi arabia", "sa")
-    SINGAPORE = ('singapore', 'sg')
+    SINGAPORE = ("singapore", "sg")
-    SOUTHAFRICA = ('south africa', 'za')
+    SOUTHAFRICA = ("south africa", "za")
-    SOUTHKOREA = ('south korea', 'kr')
+    SOUTHKOREA = ("south korea", "kr")
-    SPAIN = ('spain', 'es')
+    SPAIN = ("spain", "es")
-    SWEDEN = ('sweden', 'se')
+    SWEDEN = ("sweden", "se")
-    SWITZERLAND = ('switzerland', 'ch')
+    SWITZERLAND = ("switzerland", "ch")
-    TAIWAN = ('taiwan', 'tw')
+    TAIWAN = ("taiwan", "tw")
-    THAILAND = ('thailand', 'th')
+    THAILAND = ("thailand", "th")
-    TURKEY = ('turkey', 'tr')
+    TURKEY = ("turkey", "tr")
-    UKRAINE = ('ukraine', 'ua')
+    UKRAINE = ("ukraine", "ua")
-    UNITEDARABEMIRATES = ('united arab emirates', 'ae')
+    UNITEDARABEMIRATES = ("united arab emirates", "ae")
-    UK = ('uk', 'uk')
+    UK = ("uk", "uk")
-    USA = ('usa', 'www')
+    USA = ("usa", "www")
-    URUGUAY = ('uruguay', 'uy')
+    URUGUAY = ("uruguay", "uy")
-    VENEZUELA = ('venezuela', 've')
+    VENEZUELA = ("venezuela", "ve")
-    VIETNAM = ('vietnam', 'vn')
+    VIETNAM = ("vietnam", "vn")
     # internal for ziprecruiter
-    US_CANADA = ('usa/ca', 'www')
+    US_CANADA = ("usa/ca", "www")
     # internal for linkedin
-    WORLDWIDE = ('worldwide', 'www')
+    WORLDWIDE = ("worldwide", "www")

     def __new__(cls, country, domain):
         obj = object.__new__(cls)
@@ -109,7 +137,9 @@ class Country(Enum):
             if country.value == country_str:
                 return country
         valid_countries = [country.value for country in cls]
-        raise ValueError(f"Invalid country string: '{country_str}'. Valid countries (only include this param for Indeed) are: {', '.join(valid_countries)}")
+        raise ValueError(
+            f"Invalid country string: '{country_str}'. Valid countries (only include this param for Indeed) are: {', '.join(valid_countries)}"
+        )


 class Location(BaseModel):
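The hunk above only reformats the raise statement, but it shows how string lookup against the Country enum works: each member's value is the lower-case country name, and an unknown name raises a ValueError listing the valid options. A self-contained sketch of that pattern with a trimmed member list; the classmethod name from_string and the domain attribute handling are assumptions inferred from the call sites in this diff:

from enum import Enum

class Country(Enum):
    USA = ("usa", "www")
    UK = ("uk", "uk")

    def __new__(cls, country, domain):
        obj = object.__new__(cls)
        obj._value_ = country   # assumption: the value is the country name string
        obj.domain = domain     # assumption: the second tuple entry is the site domain
        return obj

    @classmethod
    def from_string(cls, country_str: str) -> "Country":
        for country in cls:
            if country.value == country_str:
                return country
        valid_countries = [country.value for country in cls]
        raise ValueError(
            f"Invalid country string: '{country_str}'. Valid countries "
            f"(only include this param for Indeed) are: {', '.join(valid_countries)}"
        )

print(Country.from_string("usa"))  # Country.USA
Country.from_string("narnia")      # raises ValueError listing the valid names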
@@ -124,7 +154,7 @@ class Location(BaseModel):
         if self.state:
             location_parts.append(self.state)
         if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
-            if self.country.value in ('usa', 'uk'):
+            if self.country.value in ("usa", "uk"):
                 location_parts.append(self.country.value.upper())
             else:
                 location_parts.append(self.country.value.title())
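For display, display_location() appends city, state, and country, upper-casing only the "usa" and "uk" values, title-casing other country names, and skipping the internal US_CANADA and WORLDWIDE markers. A hedged usage example; it assumes state is optional, that the parts are joined with ", ", and that the import path below is correct:

from jobspy.jobs import Country, Location  # import path is an assumption

print(Location(city="Austin", state="TX", country=Country.USA).display_location())
# "Austin, TX, USA"   (usa/uk are upper-cased)

print(Location(city="Berlin", country=Country.GERMANY).display_location())
# "Berlin, Germany"   (other countries are title-cased)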

View File

@@ -38,8 +38,6 @@ class IndeedScraper(Scraper):
         self.jobs_per_page = 15
         self.seen_urls = set()
-
-

     def scrape_page(
         self, scraper_input: ScraperInput, page: int, session: tls_client.Session
     ) -> tuple[list[JobPost], int]:
@@ -80,7 +78,7 @@ class IndeedScraper(Scraper):
             raise StatusException(response.status_code)
         soup = BeautifulSoup(response.content, "html.parser")
-        with open('text2.html', 'w', encoding='utf-8') as f:
+        with open("text2.html", "w", encoding="utf-8") as f:
             f.write(str(soup))
         if "did not match any jobs" in str(soup):
             raise ParsingException("Search did not match any jobs")
@@ -103,7 +101,6 @@ class IndeedScraper(Scraper):
             if job_url in self.seen_urls:
                 return None
-

             extracted_salary = job.get("extractedSalary")
             compensation = None
             if extracted_salary:
@@ -141,7 +138,7 @@ class IndeedScraper(Scraper):
                 location=Location(
                     city=job.get("jobLocationCity"),
                     state=job.get("jobLocationState"),
-                    country=self.country
+                    country=self.country,
                 ),
                 job_type=job_type,
                 compensation=compensation,
@@ -229,13 +226,15 @@ class IndeedScraper(Scraper):
         formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
         try:
-            response = session.get(formatted_url, allow_redirects=True, timeout_seconds=5)
+            response = session.get(
+                formatted_url, allow_redirects=True, timeout_seconds=5
+            )
         except requests.exceptions.Timeout:
             print("The request timed out.")
             return None

         if response.status_code not in range(200, 400):
-            print('status code not in range')
+            print("status code not in range")
             return None

         raw_description = response.json()["body"]["jobInfoWrapperModel"][
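The hunk above wraps the detail request in a timeout guard and a status-code range check before the JSON body is read. A hedged sketch of the same guard pattern using plain requests rather than the tls_client session from the diff (so timeout= replaces timeout_seconds=); the URL shape is from the hunk, the function name is illustrative:

import requests

def fetch_job_detail(base_url: str, jk_value: str) -> dict | None:
    formatted_url = f"{base_url}/viewjob?jk={jk_value}&spa=1"
    try:
        response = requests.get(formatted_url, allow_redirects=True, timeout=5)
    except requests.exceptions.Timeout:
        print("The request timed out.")
        return None
    if response.status_code not in range(200, 400):
        print("status code not in range")
        return None
    return response.json()  # caller digs into the job description fields as above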

View File

@@ -31,7 +31,7 @@ class LinkedInScraper(Scraper):
         :param scraper_input:
         :return: job_response
         """
-        self.country = 'worldwide'
+        self.country = "worldwide"
         job_list: list[JobPost] = []
         seen_urls = set()
         page, processed_jobs, job_count = 0, 0, 0

View File

@@ -19,7 +19,7 @@ from ...jobs import (
     Location,
     JobResponse,
     JobType,
-    Country
+    Country,
 )
@@ -82,7 +82,7 @@ class ZipRecruiterScraper(Scraper):
             self.url + "/jobs-search",
             headers=ZipRecruiterScraper.headers(),
             params=params,
-            allow_redirects=True
+            allow_redirects=True,
         )
         # print(response.status_code)
@ -214,7 +214,9 @@ class ZipRecruiterScraper(Scraper):
         ).get_text()
         company = job.get("OrgName")
-        location = Location(city=job.get("City"), state=job.get("State"), country=Country.US_CANADA)
+        location = Location(
+            city=job.get("City"), state=job.get("State"), country=Country.US_CANADA
+        )
         try:
             job_type = ZipRecruiterScraper.get_job_type_enum(
                 job.get("EmploymentType", "").replace("-", "_").lower()
@@ -245,7 +247,7 @@ class ZipRecruiterScraper(Scraper):
                 interval=CompensationInterval.YEARLY,
                 min_amount=min_amount,
                 max_amount=max_amount,
-                currency = "USD/CAD"
+                currency="USD/CAD",
             )
         save_job_url = job.get("SaveJobURL", "")
         posted_time_match = re.search(
@@ -294,7 +296,10 @@ class ZipRecruiterScraper(Scraper):
         """
         try:
             response = self.session.get(
-                job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True, timeout_seconds=5
+                job_page_url,
+                headers=ZipRecruiterScraper.headers(),
+                allow_redirects=True,
+                timeout_seconds=5,
             )
         except requests.exceptions.Timeout:
             print("The request timed out.")
@@ -380,7 +385,10 @@ class ZipRecruiterScraper(Scraper):
             amounts.append(amount)

         compensation = Compensation(
-            interval=interval, min_amount=min(amounts), max_amount=max(amounts), currency="USD/CAD"
+            interval=interval,
+            min_amount=min(amounts),
+            max_amount=max(amounts),
+            currency="USD/CAD",
         )
         return compensation
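The Compensation object above is built from whatever salary figures were parsed out of the page, with the currency pinned to "USD/CAD", presumably because ZipRecruiter serves both markets. A hedged sketch of that construction; the model and field names come from the diff, while the wrapper function and import path are assumptions:

from jobspy.jobs import Compensation, CompensationInterval  # import path is an assumption

def build_compensation(amounts: list[int], interval: CompensationInterval) -> Compensation | None:
    if not amounts:
        return None
    return Compensation(
        interval=interval,
        min_amount=min(amounts),
        max_amount=max(amounts),
        currency="USD/CAD",
    )

comp = build_compensation([90_000, 120_000], CompensationInterval.YEARLY)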
@@ -404,11 +412,7 @@ class ZipRecruiterScraper(Scraper):
             city, state = None, None
         else:
             city, state = None, None
-        return Location(
-            city=city,
-            state=state,
-            country=Country.US_CANADA
-        )
+        return Location(city=city, state=state, country=Country.US_CANADA)

     @staticmethod
     def headers() -> dict: