From 338d854b96612f830a8756fddba1d88c5455fc18 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Fri, 25 Oct 2024 14:54:14 -0500
Subject: [PATCH] fix(google): search (#216)

---
 README.md                               |   8 +-
 pyproject.toml                          |   2 +-
 src/jobspy/__init__.py                  |   8 +-
 src/jobspy/scrapers/__init__.py         |   1 +
 src/jobspy/scrapers/google/__init__.py  | 131 ++++++++++++++++---
 5 files changed, 97 insertions(+), 53 deletions(-)

diff --git a/README.md b/README.md
index 7d50e98..3a86806 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ from jobspy import scrape_jobs
 jobs = scrape_jobs(
     site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
     search_term="software engineer",
+    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
     location="San Francisco, CA",
     results_wanted=20,
     hours_old=72,  # (only Linkedin/Indeed is hour specific, others round up to days old)
@@ -65,6 +66,9 @@ Optional
 | (default is all)
 │
 ├── search_term (str)
+|
+├── google_search_term (str)
+|  search term for google jobs. This is the only param for filtering google jobs.
 │
 ├── location (str)
 │
@@ -171,9 +175,9 @@ Indeed specific
 
 ## Supported Countries for Job Searching
 
-### **LinkedIn / Google**
+### **LinkedIn**
 
-LinkedIn & Google searches globally & uses only the `location` parameter.
+LinkedIn searches globally & uses only the `location` parameter.
 
 ### **ZipRecruiter**
 
diff --git a/pyproject.toml b/pyproject.toml
index 494d42b..5e684fd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.74"
+version = "1.1.75"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 6c8573b..0ad21b8 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -24,6 +24,7 @@ from .scrapers.exceptions import (
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,
+    google_search_term: str | None = None,
     location: str | None = None,
     distance: int | None = 50,
     is_remote: bool = False,
@@ -86,6 +87,7 @@ def scrape_jobs(
         site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
+        google_search_term=google_search_term,
         location=location,
         distance=distance,
         is_remote=is_remote,
@@ -216,8 +218,8 @@ def scrape_jobs(
         "title",
         "company",
         "location",
-        "job_type",
         "date_posted",
+        "job_type",
         "salary_source",
         "interval",
         "min_amount",
@@ -248,6 +250,8 @@ def scrape_jobs(
         jobs_df = jobs_df[desired_order]
 
         # Step 4: Sort the DataFrame as required
-        return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
+        return jobs_df.sort_values(
+            by=["site", "date_posted"], ascending=[True, False]
+        ).reset_index(drop=True)
     else:
         return pd.DataFrame()
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 492fd77..25c0841 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -28,6 +28,7 @@ class SalarySource(Enum):
 class ScraperInput(BaseModel):
     site_type: list[Site]
     search_term: str | None = None
+    google_search_term: str | None = None
     location: str | None = None
 
     country: Country | None = Country.USA
diff --git a/src/jobspy/scrapers/google/__init__.py b/src/jobspy/scrapers/google/__init__.py
index e074cd9..e6a2c57 100644
--- a/src/jobspy/scrapers/google/__init__.py
+++ b/src/jobspy/scrapers/google/__init__.py
@@ -59,13 +59,14 @@ class GoogleJobsScraper(Scraper):
         self.session = create_session(
             proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
         )
-        forward_cursor = self._get_initial_cursor()
+        forward_cursor, job_list = self._get_initial_cursor_and_jobs()
         if forward_cursor is None:
-            logger.error("initial cursor not found")
-            return JobResponse(jobs=[])
+            logger.warning(
+                "initial cursor not found; try changing your query, or there were 10 or fewer results"
+            )
+            return JobResponse(jobs=job_list)
 
         page = 1
-        job_list: list[JobPost] = []
 
         while (
             len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
@@ -74,7 +75,11 @@ class GoogleJobsScraper(Scraper):
             logger.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
-            jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
+            try:
+                jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
+            except Exception as e:
+                logger.error(f"failed to get jobs on page: {page}, {e}")
+                break
             if not jobs:
                 logger.info(f"found no jobs on page: {page}")
                 break
@@ -87,8 +92,8 @@ class GoogleJobsScraper(Scraper):
             ]
         )
 
-    def _get_initial_cursor(self):
-        """Gets initial cursor to paginate through job listings"""
+    def _get_initial_cursor_and_jobs(self) -> Tuple[str | None, list[JobPost]]:
+        """Gets the initial pagination cursor and the jobs on the first page"""
         query = f"{self.scraper_input.search_term} jobs"
 
         def get_time_range(hours_old):
@@ -121,13 +126,22 @@ class GoogleJobsScraper(Scraper):
         if self.scraper_input.is_remote:
             query += " remote"
 
+        if self.scraper_input.google_search_term:
+            query = self.scraper_input.google_search_term
+
         params = {"q": query, "udm": "8"}
         response = self.session.get(self.url, headers=headers_initial, params=params)
 
         pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
         match_fc = re.search(pattern_fc, response.text)
         data_async_fc = match_fc.group(1) if match_fc else None
-        return data_async_fc
+        jobs_raw = self._find_job_info_initial_page(response.text)
+        jobs = []
+        for job_raw in jobs_raw:
+            job_post = self._parse_job(job_raw)
+            if job_post:
+                jobs.append(job_post)
+        return data_async_fc, jobs
 
     def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
         params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
@@ -147,55 +161,56 @@ class GoogleJobsScraper(Scraper):
         match_fc = re.search(pattern_fc, job_data)
         data_async_fc = match_fc.group(1) if match_fc else None
         jobs_on_page = []
         for array in parsed:
             _, job_data = array
             if not job_data.startswith("[[["):
                 continue
             job_d = json.loads(job_data)
             job_info = self._find_job_info(job_d)
-
-            job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
-            if job_url in self.seen_urls:
-                continue
-            self.seen_urls.add(job_url)
-
-            title = job_info[0]
-            company_name = job_info[1]
-            location = city = job_info[2]
-            state = country = date_posted = None
-            if location and "," in location:
-                city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
-
-            days_ago_str = job_info[12]
-            if type(days_ago_str) == str:
-                match = re.search(r"\d+", days_ago_str)
-                days_ago = int(match.group()) if match else None
-                date_posted = (datetime.now() - timedelta(days=days_ago)).date()
-
-            description = job_info[19]
-
-            job_post = JobPost(
-                id=f"go-{job_info[28]}",
-                title=title,
-                company_name=company_name,
-                location=Location(
-                    city=city, state=state, country=country[0] if country else None
-                ),
-                job_url=job_url,
-                job_url_direct=job_url,
-                date_posted=date_posted,
-                is_remote="remote" in description.lower()
-                or "wfh" in description.lower(),
-                description=description,
-                emails=extract_emails_from_text(description),
-                job_type=extract_job_type(description),
-            )
-            jobs_on_page.append(job_post)
+            job_post = self._parse_job(job_info)
+            if job_post:
+                jobs_on_page.append(job_post)
         return jobs_on_page, data_async_fc
 
+    def _parse_job(self, job_info: list) -> JobPost | None:
+        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
+        if job_url in self.seen_urls:
+            return None
+        self.seen_urls.add(job_url)
+
+        title = job_info[0]
+        company_name = job_info[1]
+        location = city = job_info[2]
+        state = country = date_posted = None
+        if location and "," in location:
+            city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
+
+        days_ago_str = job_info[12]
+        if isinstance(days_ago_str, str):
+            match = re.search(r"\d+", days_ago_str)
+            days_ago = int(match.group()) if match else None
+            if days_ago is not None:
+                date_posted = (datetime.now() - timedelta(days=days_ago)).date()
+
+        description = job_info[19]
+
+        job_post = JobPost(
+            id=f"go-{job_info[28]}",
+            title=title,
+            company_name=company_name,
+            location=Location(
+                city=city, state=state, country=country[0] if country else None
+            ),
+            job_url=job_url,
+            date_posted=date_posted,
+            is_remote="remote" in description.lower() or "wfh" in description.lower(),
+            description=description,
+            emails=extract_emails_from_text(description),
+            job_type=extract_job_type(description),
+        )
+        return job_post
+
     @staticmethod
     def _find_job_info(jobs_data: list | dict) -> list | None:
         """Iterates through the JSON data to find the job listings"""
@@ -213,3 +227,20 @@ class GoogleJobsScraper(Scraper):
                 if result:
                     return result
         return None
+
+    @staticmethod
+    def _find_job_info_initial_page(html_text: str):
+        pattern = (
+            f'520084652":('
r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])" + ) + results = [] + matches = re.finditer(pattern, html_text) + + import json + + for match in matches: + try: + parsed_data = json.loads(match.group(1)) + results.append(parsed_data) + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse match: {str(e)}") + results.append({"raw_match": match.group(0), "error": str(e)}) + return results