From f6248c8386ce9edd90bd3c068e7b2f661838a568 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Thu, 24 Oct 2024 15:19:40 -0500 Subject: [PATCH] enh: google jobs (#214) --- README.md | 14 +- pyproject.toml | 2 +- src/jobspy/__init__.py | 7 +- src/jobspy/jobs/__init__.py | 2 +- src/jobspy/scrapers/__init__.py | 7 +- src/jobspy/scrapers/exceptions.py | 5 + src/jobspy/scrapers/glassdoor/__init__.py | 2 +- src/jobspy/scrapers/google/__init__.py | 217 ++++++++++++++++++++++ src/jobspy/scrapers/google/constants.py | 52 ++++++ src/jobspy/scrapers/indeed/__init__.py | 4 +- src/jobspy/scrapers/linkedin/__init__.py | 6 +- src/jobspy/scrapers/utils.py | 19 ++ tests/test_google.py | 12 ++ 13 files changed, 331 insertions(+), 18 deletions(-) create mode 100644 src/jobspy/scrapers/google/__init__.py create mode 100644 src/jobspy/scrapers/google/constants.py create mode 100644 tests/test_google.py diff --git a/README.md b/README.md index 391bcc6..6626639 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ work with us.* ## Features -- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously +- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously - Aggregates the job postings in a Pandas DataFrame - Proxies support @@ -30,9 +30,9 @@ import csv from jobspy import scrape_jobs jobs = scrape_jobs( - site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"], + site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"], search_term="software engineer", - location="Dallas, TX", + location="San Francisco, CA", results_wanted=20, hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old) country_indeed='USA', # only needed for indeed / glassdoor @@ -80,9 +80,6 @@ Optional | in format ['user:pass@host:port', 'localhost'] | each job board scraper will round robin through the proxies | -├── ca_cert (str) -| path to CA Certificate file for 
proxies -│ ├── is_remote (bool) │ ├── results_wanted (int): @@ -116,6 +113,9 @@ Optional | ├── enforce_annual_salary (bool): | converts wages to annual salary +| +├── ca_cert (str) +| path to CA Certificate file for proxies ``` ``` @@ -168,7 +168,7 @@ Indeed specific ├── company_employees_label ├── company_revenue_label ├── company_description -└── logo_photo_url +└── company_logo ``` ## Supported Countries for Job Searching diff --git a/pyproject.toml b/pyproject.toml index d443829..b74755a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.72" +version = "1.1.73" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index f9f02ad..6c8573b 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -9,6 +9,7 @@ from .scrapers.utils import set_logger_level, extract_salary, create_logger from .scrapers.indeed import IndeedScraper from .scrapers.ziprecruiter import ZipRecruiterScraper from .scrapers.glassdoor import GlassdoorScraper +from .scrapers.google import GoogleJobsScraper from .scrapers.linkedin import LinkedInScraper from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country from .scrapers.exceptions import ( @@ -16,6 +17,7 @@ from .scrapers.exceptions import ( IndeedException, ZipRecruiterException, GlassdoorException, + GoogleJobsException, ) @@ -50,6 +52,7 @@ def scrape_jobs( Site.INDEED: IndeedScraper, Site.ZIP_RECRUITER: ZipRecruiterScraper, Site.GLASSDOOR: GlassdoorScraper, + Site.GOOGLE: GoogleJobsScraper, } set_logger_level(verbose) @@ -223,12 +226,12 @@ def scrape_jobs( "is_remote", "job_level", "job_function", - "company_industry", "listing_type", "emails", "description", + "company_industry", "company_url", - "logo_photo_url", + "company_logo", "company_url_direct", 
"company_addresses", "company_num_employees", diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 48ef824..c51839c 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -256,7 +256,7 @@ class JobPost(BaseModel): company_num_employees: str | None = None company_revenue: str | None = None company_description: str | None = None - logo_photo_url: str | None = None + company_logo: str | None = None banner_photo_url: str | None = None # linkedin only atm diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py index 8ca0539..492fd77 100644 --- a/src/jobspy/scrapers/__init__.py +++ b/src/jobspy/scrapers/__init__.py @@ -17,11 +17,14 @@ class Site(Enum): INDEED = "indeed" ZIP_RECRUITER = "zip_recruiter" GLASSDOOR = "glassdoor" + GOOGLE = "google" + class SalarySource(Enum): DIRECT_DATA = "direct_data" DESCRIPTION = "description" + class ScraperInput(BaseModel): site_type: list[Site] search_term: str | None = None @@ -42,7 +45,9 @@ class ScraperInput(BaseModel): class Scraper(ABC): - def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None): + def __init__( + self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None + ): self.site = site self.proxies = proxies self.ca_cert = ca_cert diff --git a/src/jobspy/scrapers/exceptions.py b/src/jobspy/scrapers/exceptions.py index e49680b..eba0479 100644 --- a/src/jobspy/scrapers/exceptions.py +++ b/src/jobspy/scrapers/exceptions.py @@ -24,3 +24,8 @@ class ZipRecruiterException(Exception): class GlassdoorException(Exception): def __init__(self, message=None): super().__init__(message or "An error occurred with Glassdoor") + + +class GoogleJobsException(Exception): + def __init__(self, message=None): + super().__init__(message or "An error occurred with Google Jobs") diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index eab4ee5..d2de6dc 100644 --- 
class GoogleJobsScraper(Scraper):
    """Scraper for Google's job search results.

    ``scrape`` first requests the regular search page to obtain a forward
    cursor (the ``data-async-fc`` attribute) and then pages through the async
    callback endpoint, following the cursor returned with every page.
    """

    # Cursor attribute on the initial search results page.
    # NOTE(review): the opening ``<div jsname="Yust4d"[^>`` part of this
    # pattern was missing from the reviewed text and has been reconstructed -
    # confirm against the upstream repository.
    _INITIAL_CURSOR_RE = re.compile(
        r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
    )
    # Cursor attribute inside an async page response.
    _NEXT_CURSOR_RE = re.compile(r'data-async-fc="([^"]+)"')
    # Key under which a job's detail array is nested in the page payload.
    _JOB_INFO_KEY = "520084652"

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes GoogleJobsScraper with the Google job search urls
        :param proxies: proxy or list of proxies passed on to the session.
        :param ca_cert: path to a CA certificate file passed on to the session.
        """
        super().__init__(Site.GOOGLE, proxies=proxies, ca_cert=ca_cert)

        self.base_url = None
        self.country = None
        self.session = None
        self.scraper_input = None
        self.jobs_per_page = 10
        # URLs already returned, used to drop duplicates across pages.
        self.seen_urls = set()
        self.url = "https://www.google.com/search"
        self.jobs_url = "https://www.google.com/async/callback:550"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Google for jobs with scraper_input criteria.
        :param scraper_input: Information about job search criteria.
        :return: JobResponse containing a list of jobs.
        """
        self.scraper_input = scraper_input
        # Cap the request size; note this mutates the caller's input object.
        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)

        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        forward_cursor = self._get_initial_cursor()
        if forward_cursor is None:
            logger.error("initial cursor not found")
            return JobResponse(jobs=[])

        page = 1
        job_list: list[JobPost] = []
        total_wanted = scraper_input.results_wanted + scraper_input.offset
        total_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)

        while len(self.seen_urls) < total_wanted and forward_cursor:
            logger.info(f"search page: {page} / {total_pages}")
            jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
            if not jobs:
                logger.info(f"found no jobs on page: {page}")
                break
            job_list += jobs
            page += 1

        return JobResponse(
            jobs=job_list[scraper_input.offset : total_wanted]
        )

    @staticmethod
    def _get_time_range(hours_old: int) -> str:
        """Maps a maximum posting age in hours to the search phrase used for it."""
        if hours_old <= 24:
            return "since yesterday"
        if hours_old <= 72:
            return "in the last 3 days"
        if hours_old <= 168:
            return "in the last week"
        return "in the last month"

    def _build_query(self) -> str:
        """Builds the free-text search query from the current scraper input."""
        job_type_mapping = {
            JobType.FULL_TIME: "Full time",
            JobType.PART_TIME: "Part time",
            JobType.INTERNSHIP: "Internship",
            JobType.CONTRACT: "Contract",
        }
        criteria = self.scraper_input

        # search_term is optional; avoid sending the literal text "None jobs".
        query = f"{criteria.search_term} jobs" if criteria.search_term else "jobs"

        if criteria.job_type in job_type_mapping:
            query += f" {job_type_mapping[criteria.job_type]}"

        if criteria.location:
            query += f" near {criteria.location}"

        if criteria.hours_old:
            query += f" {self._get_time_range(criteria.hours_old)}"

        if criteria.is_remote:
            query += " remote"

        return query

    def _get_initial_cursor(self) -> str | None:
        """
        Gets initial cursor to paginate through job listings
        :return: the forward cursor, or None when the page does not contain one.
        """
        params = {"q": self._build_query(), "udm": "8"}
        response = self.session.get(self.url, headers=headers_initial, params=params)

        match_fc = self._INITIAL_CURSOR_RE.search(response.text)
        return match_fc.group(1) if match_fc else None

    def _get_jobs_next_page(
        self, forward_cursor: str
    ) -> Tuple[list[JobPost], str | None]:
        """
        Fetches the page identified by forward_cursor
        :return: the jobs on that page and the cursor of the following page.
        """
        params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
        response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
        return self._parse_jobs(response.text)

    def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str | None]:
        """
        Parses jobs on a page with next page cursor
        :param job_data: raw text of an async page response.
        :return: the new (not previously seen) jobs and the next cursor, which
                 is None when the response carries no cursor.
        """
        match_fc = self._NEXT_CURSOR_RE.search(job_data)
        data_async_fc = match_fc.group(1) if match_fc else None

        # The job payload is the outermost "[[[ ... ]]]" JSON array in the text.
        start_idx = job_data.find("[[[")
        end_idx = job_data.rfind("]]]")
        if start_idx == -1 or end_idx < start_idx:
            logger.warning("job payload not found in response")
            return [], data_async_fc
        parsed = json.loads(job_data[start_idx : end_idx + 3])[0]

        jobs_on_page = []
        for array in parsed:
            _, raw_job = array
            # Only entries that are themselves JSON-encoded arrays hold a job.
            if not isinstance(raw_job, str) or not raw_job.startswith("[[["):
                continue

            job_info = self._find_job_info(json.loads(raw_job))
            if not job_info:
                continue

            job_post = self._parse_job(job_info)
            if job_post is not None:
                jobs_on_page.append(job_post)
        return jobs_on_page, data_async_fc

    def _parse_job(self, job_info: list) -> JobPost | None:
        """
        Builds a JobPost from one job detail array
        :return: the job, or None when its URL has already been seen.
        """
        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
        if job_url in self.seen_urls:
            return None
        self.seen_urls.add(job_url)

        location = city = job_info[2]
        state = country = None
        if location and "," in location:
            # "City, State[, Country...]" - anything past the state lands in country.
            city, state, *country = [part.strip() for part in location.split(",")]

        date_posted = None
        days_ago_str = job_info[12]
        if isinstance(days_ago_str, str):
            # NOTE(review): the number is always treated as days; confirm the
            # field never uses other units (e.g. hours).
            match = re.search(r"\d+", days_ago_str)
            if match:
                days_ago = int(match.group())
                date_posted = (datetime.now() - timedelta(days=days_ago)).date()

        description = job_info[19]
        description_lower = description.lower() if description else ""

        return JobPost(
            id=f"go-{job_info[28]}",
            title=job_info[0],
            company_name=job_info[1],
            location=Location(
                city=city, state=state, country=country[0] if country else None
            ),
            job_url=job_url,
            job_url_direct=job_url,
            date_posted=date_posted,
            is_remote="remote" in description_lower or "wfh" in description_lower,
            description=description,
            emails=extract_emails_from_text(description) if description else None,
            job_type=extract_job_type(description),
        )

    @staticmethod
    def _find_job_info(jobs_data: list | dict) -> list | None:
        """
        Iterates through the JSON data to find the job listings
        :return: the first non-empty list stored under the job info key in a
                 depth-first walk, or None when there is none.
        """
        if isinstance(jobs_data, dict):
            for key, value in jobs_data.items():
                if key == GoogleJobsScraper._JOB_INFO_KEY and isinstance(value, list):
                    return value
                result = GoogleJobsScraper._find_job_info(value)
                if result:
                    return result
        elif isinstance(jobs_data, list):
            for item in jobs_data:
                result = GoogleJobsScraper._find_job_info(item)
                if result:
                    return result
        return None
Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", + "x-browser-channel": "stable", + "x-browser-copyright": "Copyright 2024 Google LLC. All rights reserved.", + "x-browser-year": "2024", +} + +headers_jobs = { + "accept": "*/*", + "accept-language": "en-US,en;q=0.9", + "priority": "u=1, i", + "referer": "https://www.google.com/", + "sec-ch-prefers-color-scheme": "dark", + "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"', + "sec-ch-ua-arch": '"arm"', + "sec-ch-ua-bitness": '"64"', + "sec-ch-ua-form-factors": '"Desktop"', + "sec-ch-ua-full-version": '"130.0.6723.58"', + "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-model": '""', + "sec-ch-ua-platform": '"macOS"', + "sec-ch-ua-platform-version": '"15.0.1"', + "sec-ch-ua-wow64": "?0", + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", +} + +async_param = 
"_basejs:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/am=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAACAAAoICAAAAAAAKMAfAAAAIAQAAAAAAAAAAAAACCAAAEJDAAACAAAAAGABAIAAARBAAABAAAAAgAgQAABAASKAfv8JAAABAAAAAAwAQAQACQAAAAAAcAEAQABoCAAAABAAAIABAACAAAAEAAAAFAAAAAAAAAAAAAAAAAAAAAAAAACAQADoBwAAAAAAAAAAAAAQBAAAAATQAAoACOAHAAAAAAAAAQAAAIIAAAA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/dg=0/br=1/rs=ACT90oGxMeaFMCopIHq5tuQM-6_3M_VMjQ,_basecss:/xjs/_/ss/k=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAIAIAIAoEwCAADIC8AfsgEAawwAPkAAjgoAGAAAAAAAAEADAAAAAAIgAECHAAAAAAAAAAABAQAggAARQAAAQCEAAAAAIAAAABgAAAAAIAQIACCAAfB-AAFIQABoCEA_CgEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAAAAQEAAABAgAMCPAAA4AoE2BAEAggSAAIoAQAAAAAgAAAAACCAQAAAxEwA_ZAACAAAAAAAAAAkAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAQAEAAAAAAAAAAAAAAAAAAAAAQA/br=1/rs=ACT90oGZc36t3uUQkj0srnIvvbHjO2hgyg,_basecomb:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/ck=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAKAIAoIqEwCAADIK8AfsgEAawwAPkAAjgoAGAAACCAAAEJDAAACAAIgAGCHAIAAARBAAABBAQAggAgRQABAQSOAfv8JIAABABgAAAwAYAQICSCAAfB-cAFIQABoCEA_ChEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAACAQEDoBxAgAMCPAAA4AoE2BAEAggTQAIoASOAHAAgAAAAACSAQAIIxEwA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/d=1/ed=1/dg=0/br=1/ujg=1/rs=ACT90oFNLTjPzD_OAqhhtXwe2pg1T3WpBg,_fmt:prog,_id:fc_5FwaZ86OKsfdwN4P4La3yA4_2" diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index f3f679c..bd379ab 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -72,7 +72,7 @@ class IndeedScraper(Scraper): while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset: logger.info( - f"search page: {page} / {math.ceil(scraper_input.results_wanted / 100)}" + f"search page: {page} / {math.ceil(scraper_input.results_wanted / 
def extract_job_type(description: str) -> "list[JobType] | None":
    """
    Finds the job types mentioned in a job description.

    :param description: free-text job description; may be empty or None.
    :return: every JobType whose keyword occurs in the description
             (case-insensitive), or None when the description is empty or
             mentions none of them.
    """
    # Return None rather than [] so "nothing found" has one representation.
    if not description:
        return None

    # [\s-]? accepts "full time", "fulltime" and the hyphenated "full-time".
    keywords = {
        JobType.FULL_TIME: r"full[\s-]?time",
        JobType.PART_TIME: r"part[\s-]?time",
        JobType.INTERNSHIP: r"internship",
        JobType.CONTRACT: r"contract",
    }

    listing_types = [
        job_type
        for job_type, pattern in keywords.items()
        if re.search(pattern, description, re.IGNORECASE)
    ]
    return listing_types or None