diff --git a/README.md b/README.md index 0a82353..9049ef5 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ Optional ├── job_type (enum): fulltime, parttime, internship, contract ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks] ├── is_remote (bool) +├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower) ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type' ├── easy_apply (bool): filters for jobs that are hosted on LinkedIn ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling) diff --git a/pyproject.toml b/pyproject.toml index 06ed984..5c43149 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.34" +version = "1.1.35" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 8214a1f..1a4f66d 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -40,6 +40,7 @@ def scrape_jobs( country_indeed: str = "usa", hyperlinks: bool = False, proxy: Optional[str] = None, + full_description: Optional[bool] = False, offset: Optional[int] = 0, ) -> pd.DataFrame: """ @@ -74,6 +75,7 @@ def scrape_jobs( is_remote=is_remote, job_type=job_type, easy_apply=easy_apply, + full_description=full_description, results_wanted=results_wanted, offset=offset, ) diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py index 97aaad0..37bd356 100644 --- a/src/jobspy/scrapers/__init__.py +++ b/src/jobspy/scrapers/__init__.py @@ -19,6 +19,7 @@ class ScraperInput(BaseModel): is_remote: bool = False job_type: Optional[JobType] = None easy_apply: bool = None # linkedin + full_description: bool = False offset: int = 0 results_wanted: int = 15 diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index 706b3e7..49099b2 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -5,8 +5,12 @@ jobspy.scrapers.glassdoor This module contains routines to scrape Glassdoor. """ import json -from typing import Optional, Any +import requests +from bs4 import BeautifulSoup +from typing import Optional from datetime import datetime, timedelta +from concurrent.futures import ThreadPoolExecutor, as_completed +from ..utils import count_urgent_words, extract_emails_from_text from .. import Scraper, ScraperInput, Site from ..exceptions import GlassdoorException @@ -66,50 +70,70 @@ class GlassdoorScraper(Scraper): jobs_data = res_json["data"]["jobListings"]["jobListings"] jobs = [] - for i, job in enumerate(jobs_data): - job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][ - "linkItems" - ][i]["url"] - if job_url in self.seen_urls: - continue - self.seen_urls.add(job_url) - job = job["jobview"] - title = job["job"]["jobTitleText"] - company_name = job["header"]["employerNameFromSearch"] - location_name = job["header"].get("locationName", "") - location_type = job["header"].get("locationType", "") - age_in_days = job["header"].get("ageInDays") - is_remote, location = False, None - date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None - - if location_type == "S": - is_remote = True - else: - location = self.parse_location(location_name) - - compensation = self.parse_compensation(job["header"]) - - job = JobPost( - title=title, - company_name=company_name, - date_posted=date_posted, - job_url=job_url, - location=location, - compensation=compensation, - is_remote=is_remote - ) - jobs.append(job) + with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: + future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data} + for future in as_completed(future_to_job_data): + job_data = future_to_job_data[future] + try: + job_post = future.result() + if job_post: + jobs.append(job_post) + except Exception as exc: + raise GlassdoorException(f'Glassdoor generated an exception: {exc}') return jobs, self.get_cursor_for_page( res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 ) + def process_job(self, job_data): + """Processes a single job and fetches its description.""" + job_id = job_data["jobview"]["job"]["listingId"] + job_url = f'{self.url}/job-listing/?jl={job_id}' + if job_url in self.seen_urls: + return None + self.seen_urls.add(job_url) + job = job_data["jobview"] + title = job["job"]["jobTitleText"] + company_name = job["header"]["employerNameFromSearch"] + location_name = job["header"].get("locationName", "") + location_type = job["header"].get("locationType", "") + age_in_days = job["header"].get("ageInDays") + is_remote, location = False, None + date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None + + if location_type == "S": + is_remote = True + else: + location = self.parse_location(location_name) + + compensation = self.parse_compensation(job["header"]) + + try: + description = self.fetch_job_description(job_id) + except Exception as e : + description = None + + job_post = JobPost( + title=title, + company_name=company_name, + date_posted=date_posted, + job_url=job_url, + location=location, + compensation=compensation, + is_remote=is_remote, + description=description, + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, + ) + return job_post + def scrape(self, scraper_input: ScraperInput) -> JobResponse: """ Scrapes Glassdoor for jobs with scraper_input criteria. :param scraper_input: Information about job search criteria. :return: JobResponse containing a list of jobs. """ + scraper_input.results_wanted = min(900, scraper_input.results_wanted) self.country = scraper_input.country self.url = self.country.get_url() @@ -143,6 +167,43 @@ class GlassdoorScraper(Scraper): return JobResponse(jobs=all_jobs) + def fetch_job_description(self, job_id): + """Fetches the job description for a single job ID.""" + url = f"{self.url}/graph" + body = [ + { + "operationName": "JobDetailQuery", + "variables": { + "jl": job_id, + "queryString": "q", + "pageTypeEnum": "SERP" + }, + "query": """ + query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) { + jobview: jobView( + listingId: $jl + contextHolder: {queryString: $queryString, pageTypeEnum: $pageTypeEnum} + ) { + job { + description + __typename + } + __typename + } + } + """ + } + ] + response = requests.post(url, json=body, headers=GlassdoorScraper.headers()) + if response.status_code != 200: + return None + data = response.json()[0] + desc = data['data']['jobview']['job']['description'] + soup = BeautifulSoup(desc, 'html.parser') + description = soup.get_text(separator='\n') + + return description + @staticmethod def parse_compensation(data: dict) -> Optional[Compensation]: pay_period = data.get("payPeriod") diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 00f7717..ef7d3f2 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -78,7 +78,7 @@ class IndeedScraper(Scraper): if sc_values: params["sc"] = "0kf:" + "".join(sc_values) + ";" try: - session = create_session(self.proxy, is_tls=True) + session = create_session(self.proxy) response = session.get( f"{self.url}/jobs", headers=self.get_headers(), @@ -140,7 +140,8 @@ class IndeedScraper(Scraper): date_posted = datetime.fromtimestamp(timestamp_seconds) date_posted = date_posted.strftime("%Y-%m-%d") - description = self.get_description(job_url) + description = self.get_description(job_url) if scraper_input.full_description else None + with io.StringIO(job["snippet"]) as f: soup_io = BeautifulSoup(f, "html.parser") li_elements = soup_io.find_all("li") @@ -246,7 +247,7 @@ class IndeedScraper(Scraper): return None soup = BeautifulSoup(job_description, "html.parser") - text_content = " ".join(soup.get_text(separator=" ").split()).strip() + text_content = "\n".join(soup.stripped_strings) return text_content diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 03c0cfb..882ee1d 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -111,7 +111,7 @@ class LinkedInScraper(Scraper): # Call process_job directly without threading try: - job_post = self.process_job(job_card, job_url) + job_post = self.process_job(job_card, job_url, scraper_input.full_description) if job_post: job_list.append(job_post) except Exception as e: @@ -123,7 +123,7 @@ class LinkedInScraper(Scraper): job_list = job_list[: scraper_input.results_wanted] return JobResponse(jobs=job_list) - def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]: + def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]: salary_tag = job_card.find('span', class_='job-search-card__salary-info') compensation = None @@ -160,7 +160,7 @@ class LinkedInScraper(Scraper): if metadata_card else None ) - date_posted = None + date_posted = description = job_type = None if datetime_tag and "datetime" in datetime_tag.attrs: datetime_str = datetime_tag["datetime"] try: @@ -169,9 +169,8 @@ class LinkedInScraper(Scraper): date_posted = None benefits_tag = job_card.find("span", class_="result-benefits__text") benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None - - # removed to speed up scraping - # description, job_type = self.get_job_description(job_url) + if full_descr: + description, job_type = self.get_job_description(job_url) return JobPost( title=title, @@ -182,10 +181,10 @@ class LinkedInScraper(Scraper): job_url=job_url, compensation=compensation, benefits=benefits, - # job_type=job_type, - # description=description, - # emails=extract_emails_from_text(description) if description else None, - # num_urgent_words=count_urgent_words(description) if description else None, + job_type=job_type, + description=description, + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, ) def get_job_description( @@ -214,7 +213,7 @@ class LinkedInScraper(Scraper): description = None if div_content: - description = " ".join(div_content.get_text().split()).strip() + description = "\n".join(line.strip() for line in div_content.get_text(separator="\n").splitlines() if line.strip()) def get_job_type( soup_job_type: BeautifulSoup, diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index 8e9205f..df75be5 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -109,7 +109,7 @@ class ZipRecruiterScraper(Scraper): description = BeautifulSoup( job.get("job_description", "").strip(), "html.parser" - ).get_text() + ).get_text(separator="\n") company = job["hiring_company"].get("name") if "hiring_company" in job else None country_value = "usa" if job.get("job_country") == "US" else "canada"