Compare commits

...

8 Commits

Author SHA1 Message Date
Cullen Watson
338d854b96 fix(google): search (#216) 2024-10-25 14:54:14 -05:00
Cullen Watson
811d4c40b4 chore:version 2024-10-24 15:28:25 -05:00
Cullen Watson
dba92d22c2 chore:version 2024-10-24 15:27:16 -05:00
Cullen Watson
10a3592a0f docs:file 2024-10-24 15:26:49 -05:00
Cullen Watson
b7905cc756 docs:file 2024-10-24 15:24:18 -05:00
Cullen Watson
6867d58829 docs:readme 2024-10-24 15:22:31 -05:00
Cullen Watson
f6248c8386 enh: google jobs (#214) 2024-10-24 15:19:40 -05:00
Cullen Watson
f395597fdd fix(indeed): offset 2024-10-22 19:25:07 -05:00
13 changed files with 384 additions and 33 deletions

View File

@@ -2,14 +2,12 @@
**JobSpy** is a simple, yet comprehensive, job scraping library. **JobSpy** is a simple, yet comprehensive, job scraping library.
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to *Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
work with us.* work with us.*
## Features ## Features
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously - Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame - Aggregates the job postings in a Pandas DataFrame
- Proxies support - Proxies support
@@ -30,9 +28,10 @@ import csv
from jobspy import scrape_jobs from jobspy import scrape_jobs
jobs = scrape_jobs( jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"], site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
search_term="software engineer", search_term="software engineer",
location="Dallas, TX", google_search_term="software engineer jobs near San Francisco, CA since yesterday",
location="San Francisco, CA",
results_wanted=20, results_wanted=20,
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old) hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor country_indeed='USA', # only needed for indeed / glassdoor
@@ -63,10 +62,13 @@ zip_recruiter Software Developer TEKsystems Phoenix
```plaintext ```plaintext
Optional Optional
├── site_name (list|str): ├── site_name (list|str):
| linkedin, zip_recruiter, indeed, glassdoor | linkedin, zip_recruiter, indeed, glassdoor, google
| (default is all four) | (default is all)
├── search_term (str) ├── search_term (str)
|
├── google_search_term (str)
| search term for google jobs. This is the only parameter used for filtering google jobs.
├── location (str) ├── location (str)
@@ -80,9 +82,6 @@ Optional
| in format ['user:pass@host:port', 'localhost'] | in format ['user:pass@host:port', 'localhost']
| each job board scraper will round robin through the proxies | each job board scraper will round robin through the proxies
| |
├── ca_cert (str)
| path to CA Certificate file for proxies
├── is_remote (bool) ├── is_remote (bool)
├── results_wanted (int): ├── results_wanted (int):
@@ -116,6 +115,9 @@ Optional
| |
├── enforce_annual_salary (bool): ├── enforce_annual_salary (bool):
| converts wages to annual salary | converts wages to annual salary
|
├── ca_cert (str)
| path to CA Certificate file for proxies
``` ```
``` ```
@@ -168,7 +170,7 @@ Indeed specific
├── company_employees_label ├── company_employees_label
├── company_revenue_label ├── company_revenue_label
├── company_description ├── company_description
└── logo_photo_url └── company_logo
``` ```
## Supported Countries for Job Searching ## Supported Countries for Job Searching

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.71" version = "1.1.75"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -9,6 +9,7 @@ from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import ( from .scrapers.exceptions import (
@@ -16,12 +17,14 @@ from .scrapers.exceptions import (
IndeedException, IndeedException,
ZipRecruiterException, ZipRecruiterException,
GlassdoorException, GlassdoorException,
GoogleJobsException,
) )
def scrape_jobs( def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None, site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None, search_term: str | None = None,
google_search_term: str | None = None,
location: str | None = None, location: str | None = None,
distance: int | None = 50, distance: int | None = 50,
is_remote: bool = False, is_remote: bool = False,
@@ -50,6 +53,7 @@ def scrape_jobs(
Site.INDEED: IndeedScraper, Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper, Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper, Site.GLASSDOOR: GlassdoorScraper,
Site.GOOGLE: GoogleJobsScraper,
} }
set_logger_level(verbose) set_logger_level(verbose)
@@ -83,6 +87,7 @@ def scrape_jobs(
site_type=get_site_type(), site_type=get_site_type(),
country=country_enum, country=country_enum,
search_term=search_term, search_term=search_term,
google_search_term=google_search_term,
location=location, location=location,
distance=distance, distance=distance,
is_remote=is_remote, is_remote=is_remote,
@@ -213,8 +218,8 @@ def scrape_jobs(
"title", "title",
"company", "company",
"location", "location",
"job_type",
"date_posted", "date_posted",
"job_type",
"salary_source", "salary_source",
"interval", "interval",
"min_amount", "min_amount",
@@ -223,12 +228,12 @@ def scrape_jobs(
"is_remote", "is_remote",
"job_level", "job_level",
"job_function", "job_function",
"company_industry",
"listing_type", "listing_type",
"emails", "emails",
"description", "description",
"company_industry",
"company_url", "company_url",
"logo_photo_url", "company_logo",
"company_url_direct", "company_url_direct",
"company_addresses", "company_addresses",
"company_num_employees", "company_num_employees",
@@ -245,6 +250,8 @@ def scrape_jobs(
jobs_df = jobs_df[desired_order] jobs_df = jobs_df[desired_order]
# Step 4: Sort the DataFrame as required # Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False]) return jobs_df.sort_values(
by=["site", "date_posted"], ascending=[True, False]
).reset_index(drop=True)
else: else:
return pd.DataFrame() return pd.DataFrame()

View File

@@ -256,7 +256,7 @@ class JobPost(BaseModel):
company_num_employees: str | None = None company_num_employees: str | None = None
company_revenue: str | None = None company_revenue: str | None = None
company_description: str | None = None company_description: str | None = None
logo_photo_url: str | None = None company_logo: str | None = None
banner_photo_url: str | None = None banner_photo_url: str | None = None
# linkedin only atm # linkedin only atm

View File

@@ -17,14 +17,18 @@ class Site(Enum):
INDEED = "indeed" INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter" ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor" GLASSDOOR = "glassdoor"
GOOGLE = "google"
class SalarySource(Enum): class SalarySource(Enum):
DIRECT_DATA = "direct_data" DIRECT_DATA = "direct_data"
DESCRIPTION = "description" DESCRIPTION = "description"
class ScraperInput(BaseModel): class ScraperInput(BaseModel):
site_type: list[Site] site_type: list[Site]
search_term: str | None = None search_term: str | None = None
google_search_term: str | None = None
location: str | None = None location: str | None = None
country: Country | None = Country.USA country: Country | None = Country.USA
@@ -42,7 +46,9 @@ class ScraperInput(BaseModel):
class Scraper(ABC): class Scraper(ABC):
def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None): def __init__(
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
):
self.site = site self.site = site
self.proxies = proxies self.proxies = proxies
self.ca_cert = ca_cert self.ca_cert = ca_cert

View File

@@ -24,3 +24,8 @@ class ZipRecruiterException(Exception):
class GlassdoorException(Exception): class GlassdoorException(Exception):
def __init__(self, message=None): def __init__(self, message=None):
super().__init__(message or "An error occurred with Glassdoor") super().__init__(message or "An error occurred with Glassdoor")
class GoogleJobsException(Exception):
    """Raised when the Google Jobs scraper encounters an error."""

    def __init__(self, message=None):
        # Fall back to a generic message when none (or an empty one) is given.
        default_message = "An error occurred with Google Jobs"
        super().__init__(message or default_message)

View File

@@ -214,7 +214,7 @@ class GlassdoorScraper(Scraper):
is_remote=is_remote, is_remote=is_remote,
description=description, description=description,
emails=extract_emails_from_text(description) if description else None, emails=extract_emails_from_text(description) if description else None,
logo_photo_url=company_logo, company_logo=company_logo,
listing_type=listing_type, listing_type=listing_type,
) )

View File

@@ -0,0 +1,250 @@
"""
jobspy.scrapers.google
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Google.
"""
from __future__ import annotations
import math
import re
import json
from typing import Tuple
from datetime import datetime, timedelta
from .constants import headers_jobs, headers_initial, async_param
from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text, create_logger, extract_job_type
from ..utils import (
create_session,
)
from ...jobs import (
JobPost,
JobResponse,
Location,
JobType,
)
logger = create_logger("Google")
class GoogleJobsScraper(Scraper):
    """Scraper for Google Jobs search results.

    Fetches the initial jobs SERP over plain HTTP, then paginates through
    Google's async callback endpoint using the forward cursor embedded in
    each response page.
    """

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes Google Scraper with the Google jobs search url
        """
        site = Site(Site.GOOGLE)
        super().__init__(site, proxies=proxies, ca_cert=ca_cert)

        self.country = None
        self.session = None
        self.scraper_input = None
        self.jobs_per_page = 10
        self.seen_urls = set()
        self.url = "https://www.google.com/search"
        self.jobs_url = "https://www.google.com/async/callback:550"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Google for jobs with scraper_input criteria.
        :param scraper_input: Information about job search criteria.
        :return: JobResponse containing a list of jobs.
        """
        self.scraper_input = scraper_input
        # Google stops serving results well before this; cap to avoid looping forever.
        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)

        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        forward_cursor, job_list = self._get_initial_cursor_and_jobs()
        if forward_cursor is None:
            logger.warning(
                "initial cursor not found, try changing your query or there were at most 10 results"
            )
            return JobResponse(jobs=job_list)

        page = 1
        while (
            len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
            and forward_cursor
        ):
            logger.info(
                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
            )
            try:
                jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
            except Exception as e:
                logger.error(f"failed to get jobs on page: {page}, {e}")
                break
            if not jobs:
                logger.info(f"found no jobs on page: {page}")
                break
            job_list += jobs
            page += 1
        # Apply offset/limit only at the end, since pages are fetched in bulk.
        return JobResponse(
            jobs=job_list[
                scraper_input.offset : scraper_input.offset
                + scraper_input.results_wanted
            ]
        )

    def _get_initial_cursor_and_jobs(self) -> Tuple[str | None, list[JobPost]]:
        """Gets initial cursor and jobs to paginate through job listings"""
        query = f"{self.scraper_input.search_term} jobs"

        def get_time_range(hours_old):
            # Google only understands coarse, natural-language recency filters.
            if hours_old <= 24:
                return "since yesterday"
            elif hours_old <= 72:
                return "in the last 3 days"
            elif hours_old <= 168:
                return "in the last week"
            else:
                return "in the last month"

        job_type_mapping = {
            JobType.FULL_TIME: "Full time",
            JobType.PART_TIME: "Part time",
            JobType.INTERNSHIP: "Internship",
            JobType.CONTRACT: "Contract",
        }

        if self.scraper_input.job_type in job_type_mapping:
            query += f" {job_type_mapping[self.scraper_input.job_type]}"

        if self.scraper_input.location:
            query += f" near {self.scraper_input.location}"

        if self.scraper_input.hours_old:
            time_filter = get_time_range(self.scraper_input.hours_old)
            query += f" {time_filter}"

        if self.scraper_input.is_remote:
            query += " remote"

        # An explicit google_search_term overrides the query built above.
        if self.scraper_input.google_search_term:
            query = self.scraper_input.google_search_term

        params = {"q": query, "udm": "8"}
        response = self.session.get(self.url, headers=headers_initial, params=params)

        # The forward cursor lives in a data-async-fc attribute on the SERP.
        pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, response.text)
        data_async_fc = match_fc.group(1) if match_fc else None

        jobs = []
        for job_raw in self._find_job_info_initial_page(response.text):
            job_post = self._parse_job(job_raw)
            if job_post:
                jobs.append(job_post)
        return data_async_fc, jobs

    def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
        """Fetch one page of results from the async callback endpoint."""
        params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
        response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
        return self._parse_jobs(response.text)

    def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str]:
        """
        Parses jobs on a page with next page cursor
        """
        # The payload embeds a JSON array between the outermost [[[ ... ]]].
        start_idx = job_data.find("[[[")
        end_idx = job_data.rindex("]]]") + 3
        parsed = json.loads(job_data[start_idx:end_idx])[0]

        pattern_fc = r'data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, job_data)
        data_async_fc = match_fc.group(1) if match_fc else None

        jobs_on_page = []
        for array in parsed:
            # renamed from job_data to avoid shadowing the parameter
            _, raw_entry = array
            if not raw_entry.startswith("[[["):
                continue
            job_d = json.loads(raw_entry)
            job_info = self._find_job_info(job_d)
            job_post = self._parse_job(job_info)
            if job_post:
                jobs_on_page.append(job_post)
        return jobs_on_page, data_async_fc

    def _parse_job(self, job_info: list) -> JobPost | None:
        """Build a JobPost from a raw job-info array; returns None for duplicates."""
        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
        if job_url in self.seen_urls:
            return
        self.seen_urls.add(job_url)

        title = job_info[0]
        company_name = job_info[1]
        location = city = job_info[2]
        state = country = date_posted = None
        if location and "," in location:
            city, state, *country = [*map(lambda x: x.strip(), location.split(","))]

        days_ago_str = job_info[12]
        if isinstance(days_ago_str, str):
            match = re.search(r"\d+", days_ago_str)
            # Guard: original code passed None into timedelta() when the
            # string contained no digits, raising a TypeError.
            if match:
                date_posted = (
                    datetime.now() - timedelta(days=int(match.group()))
                ).date()

        description = job_info[19]
        # Guard: description may be None; .lower() on it would raise.
        description_lower = description.lower() if description else ""
        job_post = JobPost(
            id=f"go-{job_info[28]}",
            title=title,
            company_name=company_name,
            location=Location(
                city=city, state=state, country=country[0] if country else None
            ),
            job_url=job_url,
            date_posted=date_posted,
            is_remote="remote" in description_lower or "wfh" in description_lower,
            description=description,
            emails=extract_emails_from_text(description),
            job_type=extract_job_type(description),
        )
        return job_post

    @staticmethod
    def _find_job_info(jobs_data: list | dict) -> list | None:
        """Iterates through the JSON data to find the job listings"""
        if isinstance(jobs_data, dict):
            for key, value in jobs_data.items():
                # "520084652" is the key Google uses for the job-listing array.
                if key == "520084652" and isinstance(value, list):
                    return value
                else:
                    result = GoogleJobsScraper._find_job_info(value)
                    if result:
                        return result
        elif isinstance(jobs_data, list):
            for item in jobs_data:
                result = GoogleJobsScraper._find_job_info(item)
                if result:
                    return result
        return None

    @staticmethod
    def _find_job_info_initial_page(html_text: str):
        """Extract job-listing JSON arrays embedded in the initial SERP HTML."""
        # Matches the bracketed JSON value following the "520084652" key,
        # allowing up to three levels of nested brackets.
        pattern = (
            '520084652":('
            + r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
        )
        results = []
        for match in re.finditer(pattern, html_text):
            try:
                results.append(json.loads(match.group(1)))
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse match: {str(e)}")
                results.append({"raw_match": match.group(0), "error": str(e)})
        return results

View File

@@ -0,0 +1,52 @@
# Browser-impersonation headers for the initial Google Jobs search request
# (a full-page HTML navigation, hence sec-fetch-dest: document).
headers_initial = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "priority": "u=0, i",
    "referer": "https://www.google.com/",
    "sec-ch-prefers-color-scheme": "dark",
    "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    "sec-ch-ua-arch": '"arm"',
    "sec-ch-ua-bitness": '"64"',
    "sec-ch-ua-form-factors": '"Desktop"',
    "sec-ch-ua-full-version": '"130.0.6723.58"',
    "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": '""',
    "sec-ch-ua-platform": '"macOS"',
    "sec-ch-ua-platform-version": '"15.0.1"',
    "sec-ch-ua-wow64": "?0",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    "x-browser-channel": "stable",
    "x-browser-copyright": "Copyright 2024 Google LLC. All rights reserved.",
    "x-browser-year": "2024",
}

# Headers for the async pagination endpoint (an XHR-style fetch, hence
# sec-fetch-dest: empty / sec-fetch-mode: cors).
headers_jobs = {
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9",
    "priority": "u=1, i",
    "referer": "https://www.google.com/",
    "sec-ch-prefers-color-scheme": "dark",
    "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    "sec-ch-ua-arch": '"arm"',
    "sec-ch-ua-bitness": '"64"',
    "sec-ch-ua-form-factors": '"Desktop"',
    "sec-ch-ua-full-version": '"130.0.6723.58"',
    "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": '""',
    "sec-ch-ua-platform": '"macOS"',
    "sec-ch-ua-platform-version": '"15.0.1"',
    "sec-ch-ua-wow64": "?0",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
}

# Opaque "async" query blob captured from a real browser session; Google's
# callback endpoint requires it verbatim. NOTE(review): likely tied to a
# specific Google JS build and may go stale — confirm it still works if
# pagination starts returning empty pages.
async_param = "_basejs:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/am=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAACAAAoICAAAAAAAKMAfAAAAIAQAAAAAAAAAAAAACCAAAEJDAAACAAAAAGABAIAAARBAAABAAAAAgAgQAABAASKAfv8JAAABAAAAAAwAQAQACQAAAAAAcAEAQABoCAAAABAAAIABAACAAAAEAAAAFAAAAAAAAAAAAAAAAAAAAAAAAACAQADoBwAAAAAAAAAAAAAQBAAAAATQAAoACOAHAAAAAAAAAQAAAIIAAAA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/dg=0/br=1/rs=ACT90oGxMeaFMCopIHq5tuQM-6_3M_VMjQ,_basecss:/xjs/_/ss/k=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAIAIAIAoEwCAADIC8AfsgEAawwAPkAAjgoAGAAAAAAAAEADAAAAAAIgAECHAAAAAAAAAAABAQAggAARQAAAQCEAAAAAIAAAABgAAAAAIAQIACCAAfB-AAFIQABoCEA_CgEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAAAAQEAAABAgAMCPAAA4AoE2BAEAggSAAIoAQAAAAAgAAAAACCAQAAAxEwA_ZAACAAAAAAAAAAkAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAQAEAAAAAAAAAAAAAAAAAAAAAQA/br=1/rs=ACT90oGZc36t3uUQkj0srnIvvbHjO2hgyg,_basecomb:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/ck=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAKAIAoIqEwCAADIK8AfsgEAawwAPkAAjgoAGAAACCAAAEJDAAACAAIgAGCHAIAAARBAAABBAQAggAgRQABAQSOAfv8JIAABABgAAAwAYAQICSCAAfB-cAFIQABoCEA_ChEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAACAQEDoBxAgAMCPAAA4AoE2BAEAggTQAIoASOAHAAgAAAAACSAQAIIxEwA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/d=1/ed=1/dg=0/br=1/ujg=1/rs=ACT90oFNLTjPzD_OAqhhtXwe2pg1T3WpBg,_fmt:prog,_id:fc_5FwaZ86OKsfdwN4P4La3yA4_2"

View File

@@ -69,17 +69,10 @@ class IndeedScraper(Scraper):
page = 1 page = 1
cursor = None cursor = None
offset_pages = math.ceil(self.scraper_input.offset / 100)
for _ in range(offset_pages):
logger.info(f"skipping search page: {page}")
__, cursor = self._scrape_page(cursor)
if not __:
logger.info(f"found no jobs on page: {page}")
break
while len(self.seen_urls) < scraper_input.results_wanted: while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
logger.info( logger.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / 100)}" f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
) )
jobs, cursor = self._scrape_page(cursor) jobs, cursor = self._scrape_page(cursor)
if not jobs: if not jobs:
@@ -87,7 +80,12 @@ class IndeedScraper(Scraper):
break break
job_list += jobs job_list += jobs
page += 1 page += 1
return JobResponse(jobs=job_list[: scraper_input.results_wanted]) return JobResponse(
jobs=job_list[
scraper_input.offset : scraper_input.offset
+ scraper_input.results_wanted
]
)
def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]: def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
""" """
@@ -260,7 +258,7 @@ class IndeedScraper(Scraper):
company_num_employees=employer_details.get("employeesLocalizedLabel"), company_num_employees=employer_details.get("employeesLocalizedLabel"),
company_revenue=employer_details.get("revenueLocalizedLabel"), company_revenue=employer_details.get("revenueLocalizedLabel"),
company_description=employer_details.get("briefDescription"), company_description=employer_details.get("briefDescription"),
logo_photo_url=( company_logo=(
employer["images"].get("squareLogoUrl") employer["images"].get("squareLogoUrl")
if employer and employer.get("images") if employer and employer.get("images")
else None else None

View File

@@ -232,7 +232,7 @@ class LinkedInScraper(Scraper):
description=job_details.get("description"), description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"), job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")), emails=extract_emails_from_text(job_details.get("description")),
logo_photo_url=job_details.get("logo_photo_url"), company_logo=job_details.get("company_logo"),
job_function=job_details.get("job_function"), job_function=job_details.get("job_function"),
) )
@@ -275,7 +275,7 @@ class LinkedInScraper(Scraper):
if job_function_span: if job_function_span:
job_function = job_function_span.text.strip() job_function = job_function_span.text.strip()
logo_photo_url = ( company_logo = (
logo_image.get("data-delayed-url") logo_image.get("data-delayed-url")
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"})) if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
else None else None
@@ -286,7 +286,7 @@ class LinkedInScraper(Scraper):
"company_industry": self._parse_company_industry(soup), "company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup), "job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup), "job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": logo_photo_url, "company_logo": company_logo,
"job_function": job_function, "job_function": job_function,
} }

View File

@@ -264,3 +264,22 @@ def extract_salary(
else: else:
return interval, min_salary, max_salary, "USD" return interval, min_salary, max_salary, "USD"
return None, None, None, None return None, None, None, None
def extract_job_type(description: str):
    """Scan a job description for employment-type keywords.

    Returns the list of matching JobType values, or None when the
    description matches none of them; a missing/empty description
    yields an empty list.
    """
    if not description:
        return []

    patterns = (
        (JobType.FULL_TIME, r"full\s?time"),
        (JobType.PART_TIME, r"part\s?time"),
        (JobType.INTERNSHIP, r"internship"),
        (JobType.CONTRACT, r"contract"),
    )
    found = [
        job_type
        for job_type, pattern in patterns
        if re.search(pattern, description, re.IGNORECASE)
    ]
    return found or None

12
tests/test_google.py Normal file
View File

@@ -0,0 +1,12 @@
from jobspy import scrape_jobs
import pandas as pd
def test_google():
    """Live smoke test: scraping Google should return exactly 5 jobs.

    NOTE(review): hits the real Google endpoint — requires network access
    and may be flaky if Google changes its markup or rate-limits requests.
    """
    result = scrape_jobs(
        site_name="google", search_term="software engineer", results_wanted=5
    )
    assert (
        isinstance(result, pd.DataFrame) and len(result) == 5
    ), "Result should be a non-empty DataFrame"