diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index c6e2197..1b6c68c 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -5,12 +5,7 @@ from .jobs import JobType, Location from .scrapers.indeed import IndeedScraper from .scrapers.ziprecruiter import ZipRecruiterScraper from .scrapers.linkedin import LinkedInScraper -from .scrapers import ( - ScraperInput, - Site, - JobResponse, - Country -) +from .scrapers import ScraperInput, Site, JobResponse, Country SCRAPER_MAPPING = { @@ -33,7 +28,7 @@ def scrape_jobs( job_type: JobType = None, easy_apply: bool = False, # linkedin results_wanted: int = 15, - country: str = 'usa' + country: str = "usa", ) -> pd.DataFrame: """ Asynchronously scrapes job data from multiple job sites. @@ -76,14 +71,14 @@ def scrape_jobs( for job in job_response.jobs: data = job.dict() data["site"] = site - data['company'] = data['company_name'] + data["company"] = data["company_name"] if data["job_type"]: # Take the first value from the job type tuple data["job_type"] = data["job_type"].value[0] else: data["job_type"] = None - data['location'] = Location(**data['location']).display_location() + data["location"] = Location(**data["location"]).display_location() compensation_obj = data.get("compensation") if compensation_obj and isinstance(compensation_obj, dict): @@ -110,7 +105,7 @@ def scrape_jobs( "site", "title", "company", - 'location', + "location", "job_type", "interval", "min_amount", diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 8984233..ebc2e1f 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -6,13 +6,41 @@ from pydantic import BaseModel, validator class JobType(Enum): - FULL_TIME = ("fulltime", "períodointegral", "estágio/trainee", "cunormăîntreagă", "tiempocompleto", "vollzeit", "voltijds", "tempointegral", "全职", 'plnýúvazek', 'fuldtid', 'دوامكامل' , - 'kokopäivätyö', 'tempsplein', 'vollzeit', 'πλήρηςαπασχόληση', 'teljesmunkaidő', 'tempopieno', 'tempsplein', 'heltid', 'jornadacompleta', 'pełnyetat', '정규직', '100%', '全職', - 'งานประจำ', 'tamzamanlı', 'повназайнятість', 'toànthờigian') + FULL_TIME = ( + "fulltime", + "períodointegral", + "estágio/trainee", + "cunormăîntreagă", + "tiempocompleto", + "vollzeit", + "voltijds", + "tempointegral", + "全职", + "plnýúvazek", + "fuldtid", + "دوامكامل", + "kokopäivätyö", + "tempsplein", + "vollzeit", + "πλήρηςαπασχόληση", + "teljesmunkaidő", + "tempopieno", + "tempsplein", + "heltid", + "jornadacompleta", + "pełnyetat", + "정규직", + "100%", + "全職", + "งานประจำ", + "tamzamanlı", + "повназайнятість", + "toànthờigian", + ) PART_TIME = ("parttime", "teilzeit") CONTRACT = ("contract", "contractor") TEMPORARY = ("temporary",) - INTERNSHIP = ("internship", "prácticas", 'ojt(onthejobtraining)', 'praktikum') + INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum") PER_DIEM = ("perdiem",) NIGHTS = ("nights",) @@ -22,74 +50,74 @@ class JobType(Enum): class Country(Enum): - ARGENTINA = ('argentina', 'ar') - AUSTRALIA = ('australia', 'au') - AUSTRIA = ('austria', 'at') - BAHRAIN = ('bahrain', 'bh') - BELGIUM = ('belgium', 'be') - BRAZIL = ('brazil', 'br') - CANADA = ('canada', 'ca') - CHILE = ('chile', 'cl') - CHINA = ('china', 'cn') - COLOMBIA = ('colombia', 'co') - COSTARICA = ('costa rica', 'cr') - CZECHREPUBLIC = ('czech republic', 'cz') - DENMARK = ('denmark', 'dk') - ECUADOR = ('ecuador', 'ec') - EGYPT = ('egypt', 'eg') - FINLAND = ('finland', 'fi') - FRANCE = ('france', 'fr') - GERMANY = ('germany', 'de') - GREECE = ('greece', 'gr') - HONGKONG = ('hong kong', 'hk') - HUNGARY = ('hungary', 'hu') - INDIA = ('india', 'in') - INDONESIA = ('indonesia', 'id') - IRELAND = ('ireland', 'ie') - ISRAEL = ('israel', 'il') - ITALY = ('italy', 'it') - JAPAN = ('japan', 'jp') - KUWAIT = ('kuwait', 'kw') - LUXEMBOURG = ('luxembourg', 'lu') - MALAYSIA = ('malaysia', 'malaysia') - MEXICO = ('mexico', 'mx') - MOROCCO = ('morocco', 'ma') - NETHERLANDS = ('netherlands', 'nl') - NEWZEALAND = ('new zealand', 'nz') - NIGERIA = ('nigeria', 'ng') - NORWAY = ('norway', 'no') - OMAN = ('oman', 'om') - PAKISTAN = ('pakistan', 'pk') - PANAMA = ('panama', 'pa') - PERU = ('peru', 'pe') - PHILIPPINES = ('philippines', 'ph') - POLAND = ('poland', 'pl') - PORTUGAL = ('portugal', 'pt') - QATAR = ('qatar', 'qa') - ROMANIA = ('romania', 'ro') - SAUDIARABIA = ('saudi arabia', 'sa') - SINGAPORE = ('singapore', 'sg') - SOUTHAFRICA = ('south africa', 'za') - SOUTHKOREA = ('south korea', 'kr') - SPAIN = ('spain', 'es') - SWEDEN = ('sweden', 'se') - SWITZERLAND = ('switzerland', 'ch') - TAIWAN = ('taiwan', 'tw') - THAILAND = ('thailand', 'th') - TURKEY = ('turkey', 'tr') - UKRAINE = ('ukraine', 'ua') - UNITEDARABEMIRATES = ('united arab emirates', 'ae') - UK = ('uk', 'uk') - USA = ('usa', 'www') - URUGUAY = ('uruguay', 'uy') - VENEZUELA = ('venezuela', 've') - VIETNAM = ('vietnam', 'vn') + ARGENTINA = ("argentina", "ar") + AUSTRALIA = ("australia", "au") + AUSTRIA = ("austria", "at") + BAHRAIN = ("bahrain", "bh") + BELGIUM = ("belgium", "be") + BRAZIL = ("brazil", "br") + CANADA = ("canada", "ca") + CHILE = ("chile", "cl") + CHINA = ("china", "cn") + COLOMBIA = ("colombia", "co") + COSTARICA = ("costa rica", "cr") + CZECHREPUBLIC = ("czech republic", "cz") + DENMARK = ("denmark", "dk") + ECUADOR = ("ecuador", "ec") + EGYPT = ("egypt", "eg") + FINLAND = ("finland", "fi") + FRANCE = ("france", "fr") + GERMANY = ("germany", "de") + GREECE = ("greece", "gr") + HONGKONG = ("hong kong", "hk") + HUNGARY = ("hungary", "hu") + INDIA = ("india", "in") + INDONESIA = ("indonesia", "id") + IRELAND = ("ireland", "ie") + ISRAEL = ("israel", "il") + ITALY = ("italy", "it") + JAPAN = ("japan", "jp") + KUWAIT = ("kuwait", "kw") + LUXEMBOURG = ("luxembourg", "lu") + MALAYSIA = ("malaysia", "malaysia") + MEXICO = ("mexico", "mx") + MOROCCO = ("morocco", "ma") + NETHERLANDS = ("netherlands", "nl") + NEWZEALAND = ("new zealand", "nz") + NIGERIA = ("nigeria", "ng") + NORWAY = ("norway", "no") + OMAN = ("oman", "om") + PAKISTAN = ("pakistan", "pk") + PANAMA = ("panama", "pa") + PERU = ("peru", "pe") + PHILIPPINES = ("philippines", "ph") + POLAND = ("poland", "pl") + PORTUGAL = ("portugal", "pt") + QATAR = ("qatar", "qa") + ROMANIA = ("romania", "ro") + SAUDIARABIA = ("saudi arabia", "sa") + SINGAPORE = ("singapore", "sg") + SOUTHAFRICA = ("south africa", "za") + SOUTHKOREA = ("south korea", "kr") + SPAIN = ("spain", "es") + SWEDEN = ("sweden", "se") + SWITZERLAND = ("switzerland", "ch") + TAIWAN = ("taiwan", "tw") + THAILAND = ("thailand", "th") + TURKEY = ("turkey", "tr") + UKRAINE = ("ukraine", "ua") + UNITEDARABEMIRATES = ("united arab emirates", "ae") + UK = ("uk", "uk") + USA = ("usa", "www") + URUGUAY = ("uruguay", "uy") + VENEZUELA = ("venezuela", "ve") + VIETNAM = ("vietnam", "vn") # internal for ziprecruiter - US_CANADA = ('usa/ca', 'www') + US_CANADA = ("usa/ca", "www") # internal for linkeind - WORLDWIDE = ('worldwide', 'www') + WORLDWIDE = ("worldwide", "www") def __new__(cls, country, domain): obj = object.__new__(cls) @@ -109,7 +137,9 @@ class Country(Enum): if country.value == country_str: return country valid_countries = [country.value for country in cls] - raise ValueError(f"Invalid country string: '{country_str}'. Valid countries (only include this param for Indeed) are: {', '.join(valid_countries)}") + raise ValueError( + f"Invalid country string: '{country_str}'. Valid countries (only include this param for Indeed) are: {', '.join(valid_countries)}" + ) class Location(BaseModel): @@ -124,7 +154,7 @@ class Location(BaseModel): if self.state: location_parts.append(self.state) if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE): - if self.country.value in ('usa', 'uk'): + if self.country.value in ("usa", "uk"): location_parts.append(self.country.value.upper()) else: location_parts.append(self.country.value.title()) diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 72815b3..70df33b 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -38,8 +38,6 @@ class IndeedScraper(Scraper): self.jobs_per_page = 15 self.seen_urls = set() - - def scrape_page( self, scraper_input: ScraperInput, page: int, session: tls_client.Session ) -> tuple[list[JobPost], int]: @@ -80,7 +78,7 @@ class IndeedScraper(Scraper): raise StatusException(response.status_code) soup = BeautifulSoup(response.content, "html.parser") - with open('text2.html', 'w', encoding='utf-8') as f: + with open("text2.html", "w", encoding="utf-8") as f: f.write(str(soup)) if "did not match any jobs" in str(soup): raise ParsingException("Search did not match any jobs") @@ -103,7 +101,6 @@ class IndeedScraper(Scraper): if job_url in self.seen_urls: return None - extracted_salary = job.get("extractedSalary") compensation = None if extracted_salary: @@ -141,7 +138,7 @@ class IndeedScraper(Scraper): location=Location( city=job.get("jobLocationCity"), state=job.get("jobLocationState"), - country=self.country + country=self.country, ), job_type=job_type, compensation=compensation, @@ -229,13 +226,15 @@ class IndeedScraper(Scraper): formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1" try: - response = session.get(formatted_url, allow_redirects=True, timeout_seconds=5) + response = session.get( + formatted_url, allow_redirects=True, timeout_seconds=5 + ) except requests.exceptions.Timeout: print("The request timed out.") return None if response.status_code not in range(200, 400): - print('status code not in range') + print("status code not in range") return None raw_description = response.json()["body"]["jobInfoWrapperModel"][ diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 5a7a297..03ca40f 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -31,7 +31,7 @@ class LinkedInScraper(Scraper): :param scraper_input: :return: job_response """ - self.country = 'worldwide' + self.country = "worldwide" job_list: list[JobPost] = [] seen_urls = set() page, processed_jobs, job_count = 0, 0, 0 diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index e19db8f..881b123 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -19,7 +19,7 @@ from ...jobs import ( Location, JobResponse, JobType, - Country + Country, ) @@ -82,7 +82,7 @@ class ZipRecruiterScraper(Scraper): self.url + "/jobs-search", headers=ZipRecruiterScraper.headers(), params=params, - allow_redirects=True + allow_redirects=True, ) # print(response.status_code) @@ -214,7 +214,9 @@ class ZipRecruiterScraper(Scraper): ).get_text() company = job.get("OrgName") - location = Location(city=job.get("City"), state=job.get("State"), country=Country.US_CANADA) + location = Location( + city=job.get("City"), state=job.get("State"), country=Country.US_CANADA + ) try: job_type = ZipRecruiterScraper.get_job_type_enum( job.get("EmploymentType", "").replace("-", "_").lower() @@ -245,7 +247,7 @@ class ZipRecruiterScraper(Scraper): interval=CompensationInterval.YEARLY, min_amount=min_amount, max_amount=max_amount, - currency = "USD/CAD" + currency="USD/CAD", ) save_job_url = job.get("SaveJobURL", "") posted_time_match = re.search( @@ -294,7 +296,10 @@ class ZipRecruiterScraper(Scraper): """ try: response = self.session.get( - job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True, timeout_seconds=5 + job_page_url, + headers=ZipRecruiterScraper.headers(), + allow_redirects=True, + timeout_seconds=5, ) except requests.exceptions.Timeout: print("The request timed out.") @@ -380,7 +385,10 @@ class ZipRecruiterScraper(Scraper): amounts.append(amount) compensation = Compensation( - interval=interval, min_amount=min(amounts), max_amount=max(amounts), currency="USD/CAD" + interval=interval, + min_amount=min(amounts), + max_amount=max(amounts), + currency="USD/CAD", ) return compensation @@ -404,11 +412,7 @@ class ZipRecruiterScraper(Scraper): city, state = None, None else: city, state = None, None - return Location( - city=city, - state=state, - country=Country.US_CANADA - ) + return Location(city=city, state=state, country=Country.US_CANADA) @staticmethod def headers() -> dict: