From 35bdff65a98341dedc3b8b14b47b56ea7d880643 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Tue, 10 Oct 2023 11:20:24 -0500
Subject: [PATCH] chore: readme

---
 README.md                                    |  31 +--
 examples/JobSpy_Demo.py                      |  14 +-
 pyproject.toml                               |   2 +-
 src/jobspy/__init__.py                       |  37 +--
 src/jobspy/jobs/__init__.py                  |   2 +-
 src/jobspy/scrapers/indeed/__init__.py       |  79 +++----
 src/jobspy/scrapers/linkedin/__init__.py     |  34 ++-
 src/jobspy/scrapers/ziprecruiter/__init__.py | 224 ++++++++++---------
 src/tests/test_all.py                        |   4 +-
 src/tests/test_indeed.py                     |   4 +-
 src/tests/test_linkedin.py                   |   4 +-
 src/tests/test_ziprecruiter.py               |   4 +-
 12 files changed, 225 insertions(+), 214 deletions(-)

diff --git a/README.md b/README.md
index 2f99193..269f436 100644
--- a/README.md
+++ b/README.md
@@ -33,37 +33,19 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
 
 ```python
 from jobspy import scrape_jobs
-import pandas as pd
 
-jobs: pd.DataFrame = scrape_jobs(
+jobs = scrape_jobs(
     site_name=["indeed", "linkedin", "zip_recruiter"],
     search_term="software engineer",
     location="Dallas, TX",
     results_wanted=10,
-    country_indeed='USA'  # only needed for indeed
-
-    # use if you want to use a proxy
-    # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
-    # offset=25  # use if you want to start at a specific offset
 )
+print(f"Found {len(jobs)} jobs")
+print(jobs.head())
+jobs.to_csv("jobs.csv", index=False)
 
-# formatting for pandas
-pd.set_option('display.max_columns', None)
-pd.set_option('display.max_rows', None)
-pd.set_option('display.width', None)
-pd.set_option('display.max_colwidth', 50)  # set to 0 to see full job url / desc
-
-# 1 output to console
-print(jobs)
-
-# 2 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
-# display(jobs)
-
-# 3 output to .csv
-# jobs.to_csv('jobs.csv', index=False)
-
-# 4 output to .xlsx
+# output to .xlsx
 # jobs.to_excel('jobs.xlsx', index=False)
 ```
 
@@ -117,6 +99,9 @@ JobPost
 │   ├── max_amount (int)
 │   └── currency (enum)
-└── date_posted (date)
+├── date_posted (date)
+├── emails (str)
+├── num_urgent_words (int)
+└── is_remote (bool) - just for Indeed at the moment
 ```
 
 ### Exceptions
diff --git a/examples/JobSpy_Demo.py b/examples/JobSpy_Demo.py
index 598dcd0..c982793 100644
--- a/examples/JobSpy_Demo.py
+++ b/examples/JobSpy_Demo.py
@@ -6,23 +6,23 @@ jobs: pd.DataFrame = scrape_jobs(
     search_term="software engineer",
     location="Dallas, TX",
     results_wanted=50,  # be wary: the higher it is, the more likely you'll get blocked (a rotating proxy should work, though)
-    country_indeed='USA',
+    country_indeed="USA",
     offset=25  # start jobs from an offset (use if a search failed and you want to continue)
     # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
 )
 
 # formatting for pandas
-pd.set_option('display.max_columns', None)
-pd.set_option('display.max_rows', None)
-pd.set_option('display.width', None)
-pd.set_option('display.max_colwidth', 50)  # set to 0 to see full job url / desc
+pd.set_option("display.max_columns", None)
+pd.set_option("display.max_rows", None)
+pd.set_option("display.width", None)
+pd.set_option("display.max_colwidth", 50)  # set to 0 to see full job url / desc
 
 # 1: output to console
 print(jobs)
 
 # 2: output to .csv
-jobs.to_csv('./jobs.csv', index=False)
-print('outputted to jobs.csv')
+jobs.to_csv("./jobs.csv", index=False)
+print("outputted to jobs.csv")
 
 # 3: output to .xlsx
 # jobs.to_excel('jobs.xlsx', index=False)
diff --git a/pyproject.toml b/pyproject.toml
index 2817eac..b277d13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 
[tool.poetry] name = "python-jobspy" -version = "1.1.12" +version = "1.1.13" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/cullenwatson/JobSpy" diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 67e98ff..1bbbaf4 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -26,18 +26,18 @@ def _map_str_to_site(site_name: str) -> Site: def scrape_jobs( - site_name: str | list[str] | Site | list[Site], - search_term: str, - location: str = "", - distance: int = None, - is_remote: bool = False, - job_type: str = None, - easy_apply: bool = False, # linkedin - results_wanted: int = 15, - country_indeed: str = "usa", - hyperlinks: bool = False, - proxy: Optional[str] = None, - offset: Optional[int] = 0 + site_name: str | list[str] | Site | list[Site], + search_term: str, + location: str = "", + distance: int = None, + is_remote: bool = False, + job_type: str = None, + easy_apply: bool = False, # linkedin + results_wanted: int = 15, + country_indeed: str = "usa", + hyperlinks: bool = False, + proxy: Optional[str] = None, + offset: Optional[int] = 0, ) -> pd.DataFrame: """ Simultaneously scrapes job data from multiple job sites. @@ -72,7 +72,7 @@ def scrape_jobs( job_type=job_type, easy_apply=easy_apply, results_wanted=results_wanted, - offset=offset + offset=offset, ) def scrape_site(site: Site) -> Tuple[str, JobResponse]: @@ -120,9 +120,14 @@ def scrape_jobs( ] = f'{job_data["job_url"]}' job_data["site"] = site job_data["company"] = job_data["company_name"] - job_data["job_type"] = ", ".join(job_type.value[0] for job_type in job_data["job_type"]) if job_data[ - "job_type"] else None - job_data["emails"] = ", ".join(job_data["emails"]) if job_data["emails"] else None + job_data["job_type"] = ( + ", ".join(job_type.value[0] for job_type in job_data["job_type"]) + if job_data["job_type"] + else None + ) + job_data["emails"] = ( + ", ".join(job_data["emails"]) if job_data["emails"] else None + ) job_data["location"] = Location(**job_data["location"]).display_location() compensation_obj = job_data.get("compensation") diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 93bdd69..543d6c7 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -189,7 +189,7 @@ class JobPost(BaseModel): benefits: str | None = None emails: list[str] | None = None num_urgent_words: int | None = None - # is_remote: bool | None = None + is_remote: bool | None = None # company_industry: str | None = None diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 8d51761..d52a1d0 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -10,14 +10,13 @@ import io import json from datetime import datetime -import tls_client import urllib.parse from bs4 import BeautifulSoup from bs4.element import Tag from concurrent.futures import ThreadPoolExecutor, Future from ..exceptions import IndeedException -from ..utils import count_urgent_words, extract_emails_from_text +from ..utils import count_urgent_words, extract_emails_from_text, create_session from ...jobs import ( JobPost, Compensation, @@ -43,7 +42,7 @@ class IndeedScraper(Scraper): self.seen_urls = set() def scrape_page( - self, scraper_input: ScraperInput, page: int + self, scraper_input: ScraperInput, page: int ) -> tuple[list[JobPost], int]: """ Scrapes a page of Indeed for jobs with scraper_input criteria @@ -54,7 +53,7 @@ class 
IndeedScraper(Scraper): self.country = scraper_input.country domain = self.country.domain_value self.url = f"https://{domain}.indeed.com" - session = self.create_session() + session = create_session(self.proxy) params = { "q": scraper_input.search_term, @@ -100,9 +99,9 @@ class IndeedScraper(Scraper): total_num_jobs = IndeedScraper.total_jobs(soup) if ( - not jobs.get("metaData", {}) - .get("mosaicProviderJobCardsModel", {}) - .get("results") + not jobs.get("metaData", {}) + .get("mosaicProviderJobCardsModel", {}) + .get("results") ): raise IndeedException("No jobs found.") @@ -155,8 +154,11 @@ class IndeedScraper(Scraper): compensation=compensation, date_posted=date_posted, job_url=job_url_client, - emails=extract_emails_from_text(description), + emails=extract_emails_from_text(description) if description else None, num_urgent_words=count_urgent_words(description) + if description + else None, + is_remote=self.is_remote_job(job), ) return job_post @@ -177,7 +179,7 @@ class IndeedScraper(Scraper): :return: job_response """ pages_to_process = ( - math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1 + math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1 ) #: get first page to initialize session @@ -213,7 +215,7 @@ class IndeedScraper(Scraper): params = urllib.parse.parse_qs(parsed_url.query) jk_value = params.get("jk", [None])[0] formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1" - session = self.create_session() + session = create_session(self.proxy) try: response = session.get( @@ -250,7 +252,9 @@ class IndeedScraper(Scraper): label = taxonomy["attributes"][i].get("label") if label: job_type_str = label.replace("-", "").replace(" ", "").lower() - job_types.append(IndeedScraper.get_enum_from_job_type(job_type_str)) + job_types.append( + IndeedScraper.get_enum_from_job_type(job_type_str) + ) return job_types @staticmethod @@ -281,9 +285,9 @@ class IndeedScraper(Scraper): for tag in script_tags: if ( - tag.string - and "mosaic.providerData" in tag.string - and "mosaic-provider-jobcards" in tag.string + tag.string + and "mosaic.providerData" in tag.string + and "mosaic-provider-jobcards" in tag.string ): return tag return None @@ -326,35 +330,26 @@ class IndeedScraper(Scraper): @staticmethod def get_headers(): return { - 'authority': 'www.indeed.com', - 'accept': '*/*', - 'accept-language': 'en-US,en;q=0.9', - 'referer': 'https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw', - 'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"Windows"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'same-origin', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36' + "authority": 
"www.indeed.com", + "accept": "*/*", + "accept-language": "en-US,en;q=0.9", + "referer": "https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw", + "sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"Windows"', + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", } - def create_session(self): + @staticmethod + def is_remote_job(job: dict) -> bool: """ - Creates a session with specific client identifiers and assigns proxies if available. - - :return: A session object with or without proxies. + :param job: + :return: bool """ - session = tls_client.Session( - client_identifier="chrome112", - random_tls_extension_order=True, - ) - session.proxies = self.proxy - # TODO multiple proxies - # if self.proxies: - # session.proxies = { - # "http": random.choice(self.proxies), - # "https": random.choice(self.proxies), - # } - - return session + for taxonomy in job.get("taxonomyAttributes", []): + if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0: + return True + return False diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 2e7df9a..c0681d6 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -93,13 +93,15 @@ class LinkedInScraper(Scraper): break except requests.HTTPError as e: - if hasattr(e, 'response') and e.response is not None: + if hasattr(e, "response") and e.response is not None: if e.response.status_code == 429: time.sleep(self.DELAY) retries += 1 continue else: - raise LinkedInException(f"bad response status code: {e.response.status_code}") + raise LinkedInException( + f"bad response status code: {e.response.status_code}" + ) else: raise except ProxyError as e: @@ -108,7 +110,9 @@ class LinkedInScraper(Scraper): raise LinkedInException(str(e)) else: # Raise an exception if the maximum number of retries is reached - raise LinkedInException("Max retries reached, failed to get a valid response") + raise LinkedInException( + "Max retries reached, failed to get a valid response" + ) soup = BeautifulSoup(response.text, "html.parser") @@ -135,7 +139,9 @@ class LinkedInScraper(Scraper): if job_post: job_list.append(job_post) except Exception as e: - raise LinkedInException("Exception occurred while processing jobs") + raise LinkedInException( + "Exception occurred while processing jobs" + ) page += 25 job_list = job_list[: scraper_input.results_wanted] @@ -152,7 +158,11 @@ class LinkedInScraper(Scraper): metadata_card = job_card.find("div", class_="base-search-card__metadata") location = self.get_location(metadata_card) - 
datetime_tag = metadata_card.find("time", class_="job-search-card__listdate") if metadata_card else None + datetime_tag = ( + metadata_card.find("time", class_="job-search-card__listdate") + if metadata_card + else None + ) date_posted = None if datetime_tag and "datetime" in datetime_tag.attrs: datetime_str = datetime_tag["datetime"] @@ -172,14 +182,16 @@ class LinkedInScraper(Scraper): location=location, date_posted=date_posted, job_url=job_url, + # job_type=[JobType.FULL_TIME], job_type=job_type, benefits=benefits, - emails=extract_emails_from_text(description), - num_urgent_words=count_urgent_words(description) + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, ) - def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[ - str | None, tuple[str | None, JobType | None]]: + def get_job_description( + self, job_page_url: str + ) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]: """ Retrieves job description by going to the job page url :param job_page_url: @@ -201,7 +213,7 @@ class LinkedInScraper(Scraper): description = " ".join(div_content.get_text().split()).strip() def get_job_type( - soup_job_type: BeautifulSoup, + soup_job_type: BeautifulSoup, ) -> list[JobType] | None: """ Gets the job type from job page @@ -233,7 +245,7 @@ class LinkedInScraper(Scraper): def get_enum_from_value(value_str): for job_type in JobType: if value_str in job_type.value: - return list[job_type] + return [job_type] return None def get_location(self, metadata_card: Optional[Tag]) -> Location: diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index 0373e66..b9b41a6 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -11,7 +11,6 @@ from datetime import datetime, date from typing import Optional, Tuple, Any from urllib.parse import urlparse, parse_qs, urlunparse -import tls_client import requests from bs4 import BeautifulSoup from bs4.element import Tag @@ -19,7 +18,7 @@ from concurrent.futures import ThreadPoolExecutor, Future from .. 
import Scraper, ScraperInput, Site from ..exceptions import ZipRecruiterException -from ..utils import count_urgent_words, extract_emails_from_text +from ..utils import count_urgent_words, extract_emails_from_text, create_session from ...jobs import ( JobPost, Compensation, @@ -42,12 +41,9 @@ class ZipRecruiterScraper(Scraper): self.jobs_per_page = 20 self.seen_urls = set() - self.session = tls_client.Session( - client_identifier="chrome112", random_tls_extension_order=True - ) def find_jobs_in_page( - self, scraper_input: ScraperInput, page: int + self, scraper_input: ScraperInput, page: int ) -> list[JobPost]: """ Scrapes a page of ZipRecruiter for jobs with scraper_input criteria @@ -55,14 +51,13 @@ class ZipRecruiterScraper(Scraper): :param page: :return: jobs found on page """ - job_list: list[JobPost] = [] + session = create_session(self.proxy) try: - response = self.session.get( + response = session.get( f"{self.url}/jobs-search", - headers=ZipRecruiterScraper.headers(), - params=ZipRecruiterScraper.add_params(scraper_input, page), + headers=self.headers(), + params=self.add_params(scraper_input, page), allow_redirects=True, - proxy=self.proxy, timeout_seconds=10, ) if response.status_code != 200: @@ -116,7 +111,11 @@ class ZipRecruiterScraper(Scraper): :param scraper_input: :return: job_response """ - start_page = (scraper_input.offset // self.jobs_per_page) + 1 if scraper_input.offset else 1 + start_page = ( + (scraper_input.offset // self.jobs_per_page) + 1 + if scraper_input.offset + else 1 + ) #: get first page to initialize session job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, start_page) pages_to_process = max( @@ -137,92 +136,10 @@ class ZipRecruiterScraper(Scraper): job_list = job_list[: scraper_input.results_wanted] return JobResponse(jobs=job_list) - def process_job_html_1(self, job: Tag) -> Optional[JobPost]: - """ - Parses a job from the job content tag - :param job: BeautifulSoup Tag for one job post - :return JobPost - TODO this method isnt finished due to not encountering this type of html often - """ - job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"]) - if job_url in self.seen_urls: - return None - - title = job.find("h2", {"class": "title"}).text - company = job.find("a", {"class": "company_name"}).text.strip() - - description, updated_job_url = self.get_description(job_url) - # job_url = updated_job_url if updated_job_url else job_url - if description is None: - description = job.find("p", {"class": "job_snippet"}).text.strip() - - job_type_element = job.find("li", {"class": "perk_item perk_type"}) - job_type = None - if job_type_element: - job_type_text = ( - job_type_element.text.strip().lower().replace("_", "").replace(" ", "") - ) - job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text) - - date_posted = ZipRecruiterScraper.get_date_posted(job) - - job_post = JobPost( - title=title, - description=description, - company_name=company, - location=ZipRecruiterScraper.get_location(job), - job_type=job_type, - compensation=ZipRecruiterScraper.get_compensation(job), - date_posted=date_posted, - job_url=job_url, - emails=extract_emails_from_text(description), - num_urgent_words=count_urgent_words(description) - ) - return job_post - - def process_job_html_2(self, job: Tag) -> Optional[JobPost]: - """ - Parses a job from the job content tag for a second variat of HTML that ZR uses - :param job: BeautifulSoup Tag for one job post - :return JobPost - """ - job_url = self.cleanurl(job.find("a", class_="job_link")["href"]) - 
title = job.find("h2", class_="title").text - company = job.find("a", class_="company_name").text.strip() - - description, updated_job_url = self.get_description(job_url) - # job_url = updated_job_url if updated_job_url else job_url - if description is None: - description = job.find("p", class_="job_snippet").get_text().strip() - - job_type_text = job.find("li", class_="perk_item perk_type") - job_type = None - if job_type_text: - job_type_text = ( - job_type_text.get_text() - .strip() - .lower() - .replace("-", "") - .replace(" ", "") - ) - job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text) - date_posted = ZipRecruiterScraper.get_date_posted(job) - - job_post = JobPost( - title=title, - description=description, - company_name=company, - location=ZipRecruiterScraper.get_location(job), - job_type=job_type, - compensation=ZipRecruiterScraper.get_compensation(job), - date_posted=date_posted, - job_url=job_url, - ) - return job_post - def process_job_javascript(self, job: dict) -> JobPost: + """the most common type of jobs page on ZR""" title = job.get("Title") - job_url = self.cleanurl(job.get("JobURL")) + job_url = job.get("JobURL") description, updated_job_url = self.get_description(job_url) # job_url = updated_job_url if updated_job_url else job_url @@ -276,37 +193,126 @@ class ZipRecruiterScraper(Scraper): return JobPost( title=title, - description=description, company_name=company, location=location, job_type=job_type, compensation=compensation, date_posted=date_posted, job_url=job_url, + description=description, + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, + ) + + def process_job_html_2(self, job: Tag) -> Optional[JobPost]: + """ + second most common type of jobs page on ZR after process_job_javascript() + Parses a job from the job content tag for a second variat of HTML that ZR uses + :param job: BeautifulSoup Tag for one job post + :return JobPost + """ + job_url = job.find("a", class_="job_link")["href"] + title = job.find("h2", class_="title").text + company = job.find("a", class_="company_name").text.strip() + + description, updated_job_url = self.get_description(job_url) + # job_url = updated_job_url if updated_job_url else job_url + if description is None: + description = job.find("p", class_="job_snippet").get_text().strip() + + job_type_text = job.find("li", class_="perk_item perk_type") + job_type = None + if job_type_text: + job_type_text = ( + job_type_text.get_text() + .strip() + .lower() + .replace("-", "") + .replace(" ", "") + ) + job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text) + date_posted = ZipRecruiterScraper.get_date_posted(job) + + job_post = JobPost( + title=title, + company_name=company, + location=ZipRecruiterScraper.get_location(job), + job_type=job_type, + compensation=ZipRecruiterScraper.get_compensation(job), + date_posted=date_posted, + job_url=job_url, + description=description, + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, + ) + return job_post + + def process_job_html_1(self, job: Tag) -> Optional[JobPost]: + """ + TODO this method isnt finished due to not encountering this type of html often + least common type of jobs page on ZR (rarely found) + Parses a job from the job content tag + :param job: BeautifulSoup Tag for one job post + :return JobPost + """ + job_url = job.find("a", {"class": "job_link"})["href"] + # 
job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"]) + if job_url in self.seen_urls: + return None + + title = job.find("h2", {"class": "title"}).text + company = job.find("a", {"class": "company_name"}).text.strip() + + description, _ = self.get_description(job_url) + # job_url = updated_job_url if updated_job_url else job_url + # get description from jobs listing page if get_description from the specific job page fails + if description is None: + description = job.find("p", {"class": "job_snippet"}).text.strip() + + job_type_element = job.find("li", {"class": "perk_item perk_type"}) + job_type = None + if job_type_element: + job_type_text = ( + job_type_element.text.strip().lower().replace("_", "").replace(" ", "") + ) + job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text) + + date_posted = ZipRecruiterScraper.get_date_posted(job) + + job_post = JobPost( + title=title, + description=description, + company_name=company, + location=ZipRecruiterScraper.get_location(job), + job_type=job_type, + compensation=ZipRecruiterScraper.get_compensation(job), + date_posted=date_posted, + job_url=job_url, + emails=extract_emails_from_text(description), + num_urgent_words=count_urgent_words(description), ) return job_post @staticmethod - def get_job_type_enum(job_type_str: str) -> Optional[list[JobType]]: + def get_job_type_enum(job_type_str: str) -> list[JobType] | None: for job_type in JobType: if job_type_str in job_type.value: return [job_type] return None - def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]: + def get_description(self, job_page_url: str) -> Tuple[str | None, str | None]: """ Retrieves job description by going to the job page url :param job_page_url: - :param session: :return: description or None, response url """ try: - response = requests.get( + session = create_session(self.proxy) + response = session.get( job_page_url, - headers=ZipRecruiterScraper.headers(), + headers=self.headers(), allow_redirects=True, - timeout=5, - proxies=self.proxy, + timeout_seconds=5, ) if response.status_code not in range(200, 400): return None, None @@ -462,8 +468,8 @@ class ZipRecruiterScraper(Scraper): "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36" } - @staticmethod - def cleanurl(url): - parsed_url = urlparse(url) - - return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', '')) + # @staticmethod + # def cleanurl(url) -> str: + # parsed_url = urlparse(url) + # + # return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', '')) diff --git a/src/tests/test_all.py b/src/tests/test_all.py index 749be79..5ffd333 100644 --- a/src/tests/test_all.py +++ b/src/tests/test_all.py @@ -9,4 +9,6 @@ def test_all(): results_wanted=5, ) - assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" + assert ( + isinstance(result, pd.DataFrame) and not result.empty + ), "Result should be a non-empty DataFrame" diff --git a/src/tests/test_indeed.py b/src/tests/test_indeed.py index 8c6412e..280215f 100644 --- a/src/tests/test_indeed.py +++ b/src/tests/test_indeed.py @@ -7,4 +7,6 @@ def test_indeed(): site_name="indeed", search_term="software engineer", ) - assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" + assert ( + isinstance(result, pd.DataFrame) and not result.empty + ), "Result should be a 
non-empty DataFrame" diff --git a/src/tests/test_linkedin.py b/src/tests/test_linkedin.py index 5814134..8db0a62 100644 --- a/src/tests/test_linkedin.py +++ b/src/tests/test_linkedin.py @@ -7,4 +7,6 @@ def test_linkedin(): site_name="linkedin", search_term="software engineer", ) - assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" + assert ( + isinstance(result, pd.DataFrame) and not result.empty + ), "Result should be a non-empty DataFrame" diff --git a/src/tests/test_ziprecruiter.py b/src/tests/test_ziprecruiter.py index f2d2212..cd1c8ee 100644 --- a/src/tests/test_ziprecruiter.py +++ b/src/tests/test_ziprecruiter.py @@ -8,4 +8,6 @@ def test_ziprecruiter(): search_term="software engineer", ) - assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" + assert ( + isinstance(result, pd.DataFrame) and not result.empty + ), "Result should be a non-empty DataFrame"