From 6ec7c24f7f6fc372c5a3240ab3280952ddebae72 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Sun, 4 Feb 2024 09:21:45 -0600
Subject: [PATCH] enh(linkedin): search by company ids (#99)

---
 pyproject.toml                               |  2 +-
 src/jobspy/__init__.py                       | 43 ++++++++++++--------
 src/jobspy/scrapers/__init__.py              | 18 ++++----
 src/jobspy/scrapers/indeed/__init__.py       |  2 +-
 src/jobspy/scrapers/linkedin/__init__.py     | 10 +++--
 src/jobspy/scrapers/ziprecruiter/__init__.py |  9 +++-
 6 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c816a16..8e8461d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.41"
+version = "1.1.42"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 1a4f66d..cf0222b 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -1,7 +1,7 @@
 import pandas as pd
+from typing import Tuple
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
-from typing import Tuple, Optional
 
 from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
@@ -29,19 +29,20 @@ def _map_str_to_site(site_name: str) -> Site:
 
 
 def scrape_jobs(
-    site_name: str | list[str] | Site | list[Site],
-    search_term: str,
-    location: str = "",
-    distance: int = None,
+    site_name: str | list[str] | Site | list[Site] | None = None,
+    search_term: str | None = None,
+    location: str | None = None,
+    distance: int | None = None,
     is_remote: bool = False,
-    job_type: str = None,
-    easy_apply: bool = False,  # linkedin
+    job_type: str | None = None,
+    easy_apply: bool | None = None,
     results_wanted: int = 15,
     country_indeed: str = "usa",
     hyperlinks: bool = False,
-    proxy: Optional[str] = None,
-    full_description: Optional[bool] = False,
-    offset: Optional[int] = 0,
+    proxy: str | None = None,
+    full_description: bool | None = False,
+    linkedin_company_ids: list[int] | None = None,
+    offset: int | None = 0,
 ) -> pd.DataFrame:
     """
     Simultaneously scrapes job data from multiple job sites.
@@ -56,18 +57,23 @@ def scrape_jobs(
 
     job_type = get_enum_from_value(job_type) if job_type else None
 
-    if type(site_name) == str:
-        site_type = [_map_str_to_site(site_name)]
-    else:  #: if type(site_name) == list
-        site_type = [
-            _map_str_to_site(site) if type(site) == str else site_name
-            for site in site_name
-        ]
+    def get_site_type():
+        site_types = list(Site)
+        if isinstance(site_name, str):
+            site_types = [_map_str_to_site(site_name)]
+        elif isinstance(site_name, Site):
+            site_types = [site_name]
+        elif isinstance(site_name, list):
+            site_types = [
+                _map_str_to_site(site) if isinstance(site, str) else site
+                for site in site_name
+            ]
+        return site_types
 
     country_enum = Country.from_string(country_indeed)
 
     scraper_input = ScraperInput(
-        site_type=site_type,
+        site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
         location=location,
@@ -77,6 +83,7 @@ def scrape_jobs(
         easy_apply=easy_apply,
         full_description=full_description,
         results_wanted=results_wanted,
+        linkedin_company_ids=linkedin_company_ids,
         offset=offset,
     )
 
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 37bd356..fc8c633 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -1,5 +1,4 @@
 from ..jobs import Enum, BaseModel, JobType, JobResponse, Country
-from typing import List, Optional, Any
 
 
 class Site(Enum):
@@ -10,23 +9,24 @@ class Site(Enum):
 
 
 class ScraperInput(BaseModel):
-    site_type: List[Site]
-    search_term: str
+    site_type: list[Site]
+    search_term: str | None = None
 
-    location: str = None
-    country: Optional[Country] = Country.USA
-    distance: Optional[int] = None
+    location: str | None = None
+    country: Country | None = Country.USA
+    distance: int | None = None
     is_remote: bool = False
-    job_type: Optional[JobType] = None
-    easy_apply: bool = None  # linkedin
+    job_type: JobType | None = None
+    easy_apply: bool | None = None
     full_description: bool = False
     offset: int = 0
+    linkedin_company_ids: list[int] | None = None
 
     results_wanted: int = 15
 
 
 class Scraper:
-    def __init__(self, site: Site, proxy: Optional[List[str]] = None):
+    def __init__(self, site: Site, proxy: list[str] | None = None):
         self.site = site
         self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
 
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index 06a8752..9e21e70 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -348,7 +348,7 @@ class IndeedScraper(Scraper):
     def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
         params = {
             "q": scraper_input.search_term,
-            "l": scraper_input.location,
+            "l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1],
             "filter": 0,
             "start": scraper_input.offset + page * 10,
             "sort": "date"
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index dcdac96..19b6173 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -70,7 +70,9 @@ class LinkedInScraper(Scraper):
 
             return mapping.get(job_type_enum, "")
 
-        while len(job_list) < scraper_input.results_wanted and page < 1000:
+        continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
+
+        while continue_search():
             session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
@@ -83,6 +85,7 @@ class LinkedInScraper(Scraper):
                 "pageNum": 0,
                 "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
+                "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None
             }
 
             params = {k: v for k, v in params.items() if v is not None}
@@ -130,8 +133,9 @@ class LinkedInScraper(Scraper):
             except Exception as e:
                 raise LinkedInException("Exception occurred while processing jobs")
 
-            page += 25
-            time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
+            if continue_search():
+                time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
+                page += 25
 
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index 2b07f33..adf7207 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -31,6 +31,7 @@ class ZipRecruiterScraper(Scraper):
         self.jobs_per_page = 20
         self.seen_urls = set()
+        self.delay = 5
 
     def find_jobs_in_page(
         self, scraper_input: ScraperInput, continue_token: str | None = None
@@ -59,7 +60,6 @@ class ZipRecruiterScraper(Scraper):
                 raise ZipRecruiterException("bad proxy")
             raise ZipRecruiterException(str(e))
 
-        time.sleep(5)
         response_data = response.json()
         jobs_list = response_data.get("jobs", [])
         next_continue_token = response_data.get("continue", None)
@@ -85,6 +85,9 @@ class ZipRecruiterScraper(Scraper):
             if len(job_list) >= scraper_input.results_wanted:
                 break
 
+            if page > 1:
+                time.sleep(self.delay)
+
             jobs_on_page, continue_token = self.find_jobs_in_page(
                 scraper_input, continue_token
             )
@@ -108,7 +111,7 @@ class ZipRecruiterScraper(Scraper):
             description_soup = BeautifulSoup(job_description_html, "html.parser")
             description = modify_and_get_description(description_soup)
 
-        company = job["hiring_company"].get("name") if "hiring_company" in job else None
+        company = job.get("hiring_company", {}).get("name")
         country_value = "usa" if job.get("job_country") == "US" else "canada"
         country_enum = Country.from_string(country_value)
@@ -184,6 +187,8 @@ class ZipRecruiterScraper(Scraper):
         if scraper_input.distance:
             params["radius"] = scraper_input.distance
 
+        params = {k: v for k, v in params.items() if v is not None}
+
         return params
 
     @staticmethod
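
Usage note: with this change, LinkedIn searches can be limited to specific
companies by passing their numeric company ids, which the scraper joins into
LinkedIn's f_C query parameter (see the linkedin hunk above). A minimal
sketch of the new call; the ids below are illustrative placeholders, not
values taken from this patch (a real id can be read from the URL of a
LinkedIn company page):

    from jobspy import scrape_jobs

    # Restrict the LinkedIn search to the given companies; 1441 and 1035
    # are placeholder ids used purely for illustration.
    jobs = scrape_jobs(
        site_name="linkedin",
        search_term="software engineer",
        linkedin_company_ids=[1441, 1035],
        results_wanted=15,
    )
    print(jobs.head())  # scrape_jobs returns a pandas DataFrame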