""" jobspy.scrapers.google ~~~~~~~~~~~~~~~~~~~ This module contains routines to scrape Glassdoor. """ from __future__ import annotations import math import re import json from typing import Tuple from datetime import datetime, timedelta from .constants import headers_jobs, headers_initial, async_param from .. import Scraper, ScraperInput, Site from ..utils import extract_emails_from_text, create_logger, extract_job_type from ..utils import ( create_session, ) from ...jobs import ( JobPost, JobResponse, Location, JobType, ) logger = create_logger("Google") class GoogleJobsScraper(Scraper): def __init__( self, proxies: list[str] | str | None = None, ca_cert: str | None = None ): """ Initializes GlassdoorScraper with the Glassdoor job search url """ site = Site(Site.GOOGLE) super().__init__(site, proxies=proxies, ca_cert=ca_cert) self.base_url = None self.country = None self.session = None self.scraper_input = None self.jobs_per_page = 10 self.seen_urls = set() self.url = "https://www.google.com/search" self.jobs_url = "https://www.google.com/async/callback:550" def scrape(self, scraper_input: ScraperInput) -> JobResponse: """ Scrapes Glassdoor for jobs with scraper_input criteria. :param scraper_input: Information about job search criteria. :return: JobResponse containing a list of jobs. """ self.scraper_input = scraper_input self.scraper_input.results_wanted = min(900, scraper_input.results_wanted) self.base_url = self.scraper_input.country.get_glassdoor_url() self.session = create_session( proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True ) forward_cursor = self._get_initial_cursor() if forward_cursor is None: logger.error("initial cursor not found") return JobResponse(jobs=[]) page = 1 job_list: list[JobPost] = [] while ( len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset and forward_cursor ): logger.info( f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}" ) jobs, forward_cursor = self._get_jobs_next_page(forward_cursor) if not jobs: logger.info(f"found no jobs on page: {page}") break job_list += jobs page += 1 return JobResponse( jobs=job_list[ scraper_input.offset : scraper_input.offset + scraper_input.results_wanted ] ) def _get_initial_cursor(self): """Gets initial cursor to paginate through job listings""" query = f"{self.scraper_input.search_term} jobs" def get_time_range(hours_old): if hours_old <= 24: return "since yesterday" elif hours_old <= 72: return "in the last 3 days" elif hours_old <= 168: return "in the last week" else: return "in the last month" job_type_mapping = { JobType.FULL_TIME: "Full time", JobType.PART_TIME: "Part time", JobType.INTERNSHIP: "Internship", JobType.CONTRACT: "Contract", } if self.scraper_input.job_type in job_type_mapping: query += f" {job_type_mapping[self.scraper_input.job_type]}" if self.scraper_input.location: query += f" near {self.scraper_input.location}" if self.scraper_input.hours_old: time_filter = get_time_range(self.scraper_input.hours_old) query += f" {time_filter}" if self.scraper_input.is_remote: query += " remote" params = {"q": query, "udm": "8"} response = self.session.get(self.url, headers=headers_initial, params=params) pattern_fc = r'