enh(linkedin): search by company ids (#99)

Cullen Watson 2024-02-04 09:21:45 -06:00 committed by GitHub
parent 02caf1b38d
commit 6ec7c24f7f
6 changed files with 50 additions and 34 deletions


@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.41"
+version = "1.1.42"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"


@@ -1,7 +1,7 @@
 import pandas as pd
+from typing import Tuple
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
-from typing import Tuple, Optional

 from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
@ -29,19 +29,20 @@ def _map_str_to_site(site_name: str) -> Site:
def scrape_jobs( def scrape_jobs(
site_name: str | list[str] | Site | list[Site], site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str, search_term: str | None = None,
location: str = "", location: str | None = None,
distance: int = None, distance: int | None = None,
is_remote: bool = False, is_remote: bool = False,
job_type: str = None, job_type: str | None = None,
easy_apply: bool = False, # linkedin easy_apply: bool | None = None,
results_wanted: int = 15, results_wanted: int = 15,
country_indeed: str = "usa", country_indeed: str = "usa",
hyperlinks: bool = False, hyperlinks: bool = False,
proxy: Optional[str] = None, proxy: str | None = None,
full_description: Optional[bool] = False, full_description: bool | None = False,
offset: Optional[int] = 0, linkedin_company_ids: list[int] | None = None,
offset: int | None = 0,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Simultaneously scrapes job data from multiple job sites. Simultaneously scrapes job data from multiple job sites.
@@ -56,18 +57,23 @@ def scrape_jobs(
     job_type = get_enum_from_value(job_type) if job_type else None

-    if type(site_name) == str:
-        site_type = [_map_str_to_site(site_name)]
-    else:  #: if type(site_name) == list
-        site_type = [
-            _map_str_to_site(site) if type(site) == str else site_name
-            for site in site_name
-        ]
+    def get_site_type():
+        site_types = list(Site)
+        if isinstance(site_name, str):
+            site_types = [_map_str_to_site(site_name)]
+        elif isinstance(site_name, Site):
+            site_types = [site_name]
+        elif isinstance(site_name, list):
+            site_types = [
+                _map_str_to_site(site) if isinstance(site, str) else site
+                for site in site_name
+            ]
+        return site_types

     country_enum = Country.from_string(country_indeed)

     scraper_input = ScraperInput(
-        site_type=site_type,
+        site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
         location=location,
@@ -77,6 +83,7 @@ def scrape_jobs(
         easy_apply=easy_apply,
         full_description=full_description,
         results_wanted=results_wanted,
+        linkedin_company_ids=linkedin_company_ids,
         offset=offset,
     )
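With these defaults, site_name may now be omitted entirely (get_site_type() falls back to every Site member), and LinkedIn results can be narrowed to specific companies. A minimal usage sketch, assuming the package imports as jobspy and using placeholder company IDs:

from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name="linkedin",
    search_term="software engineer",
    linkedin_company_ids=[1441, 1035],  # placeholder LinkedIn company IDs
    results_wanted=15,
)
print(jobs.head())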


@@ -1,5 +1,4 @@
 from ..jobs import Enum, BaseModel, JobType, JobResponse, Country
-from typing import List, Optional, Any


 class Site(Enum):
@@ -10,23 +9,24 @@ class Site(Enum):

 class ScraperInput(BaseModel):
-    site_type: List[Site]
-    search_term: str
+    site_type: list[Site]
+    search_term: str | None = None

-    location: str = None
-    country: Optional[Country] = Country.USA
-    distance: Optional[int] = None
+    location: str | None = None
+    country: Country | None = Country.USA
+    distance: int | None = None
     is_remote: bool = False
-    job_type: Optional[JobType] = None
-    easy_apply: bool = None  # linkedin
+    job_type: JobType | None = None
+    easy_apply: bool | None = None
     full_description: bool = False
     offset: int = 0
+    linkedin_company_ids: list[int] | None = None
     results_wanted: int = 15


 class Scraper:
-    def __init__(self, site: Site, proxy: Optional[List[str]] = None):
+    def __init__(self, site: Site, proxy: list[str] | None = None):
         self.site = site
         self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
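The proxy lambda in Scraper.__init__ expands a single proxy URL into the mapping shape the requests library accepts for its proxies argument, and passes None through unchanged. A standalone sketch of that one-liner:

def normalize_proxy(p: str | None) -> dict[str, str] | None:
    # Same shape requests expects for its `proxies` argument.
    return {"http": p, "https": p} if p else None

assert normalize_proxy("http://user:pass@host:8080") == {
    "http": "http://user:pass@host:8080",
    "https": "http://user:pass@host:8080",
}
assert normalize_proxy(None) is None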


@@ -348,7 +348,7 @@ class IndeedScraper(Scraper):
     def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
         params = {
             "q": scraper_input.search_term,
-            "l": scraper_input.location,
+            "l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1],
             "filter": 0,
             "start": scraper_input.offset + page * 10,
             "sort": "date"


@@ -70,7 +70,9 @@ class LinkedInScraper(Scraper):
             return mapping.get(job_type_enum, "")

-        while len(job_list) < scraper_input.results_wanted and page < 1000:
+        continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
+
+        while continue_search():
             session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
@@ -83,6 +85,7 @@ class LinkedInScraper(Scraper):
                 "pageNum": 0,
                 "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
+                "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None
             }
             params = {k: v for k, v in params.items() if v is not None}
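LinkedIn's f_C query parameter takes a comma-separated list of company IDs; the join above builds it only when IDs were supplied, and the dict comprehension then drops any None-valued params. Worked through with placeholder IDs:

linkedin_company_ids = [1441, 1035]  # placeholder IDs
params = {
    "f_AL": None,  # easy_apply not requested
    "f_C": ','.join(map(str, linkedin_company_ids)) if linkedin_company_ids else None,
}
params = {k: v for k, v in params.items() if v is not None}
assert params == {"f_C": "1441,1035"}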
@@ -130,8 +133,9 @@ class LinkedInScraper(Scraper):
             except Exception as e:
                 raise LinkedInException("Exception occurred while processing jobs")

-            page += 25
-            time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
+            if continue_search():
+                time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
+                page += 25

         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
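Re-checking continue_search() before sleeping means the final iteration returns immediately instead of paying one last delay. A self-contained sketch of the pattern with illustrative names and a shortened delay:

import random
import time

job_list, page, results_wanted, DELAY = [], 0, 3, 0.01
continue_search = lambda: len(job_list) < results_wanted and page < 1000

while continue_search():
    job_list.append(f"job at start={page}")  # stand-in for one fetched page
    if continue_search():                    # skip the sleep on the last pass
        time.sleep(random.uniform(DELAY, DELAY + 0.01))
        page += 25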


@@ -31,6 +31,7 @@ class ZipRecruiterScraper(Scraper):
         self.jobs_per_page = 20
         self.seen_urls = set()
+        self.delay = 5

     def find_jobs_in_page(
         self, scraper_input: ScraperInput, continue_token: str | None = None
@@ -59,7 +60,6 @@ class ZipRecruiterScraper(Scraper):
                 raise ZipRecruiterException("bad proxy")
             raise ZipRecruiterException(str(e))

-        time.sleep(5)
         response_data = response.json()
         jobs_list = response_data.get("jobs", [])
         next_continue_token = response_data.get("continue", None)
@@ -85,6 +85,9 @@ class ZipRecruiterScraper(Scraper):
             if len(job_list) >= scraper_input.results_wanted:
                 break

+            if page > 1:
+                time.sleep(self.delay)
+
             jobs_on_page, continue_token = self.find_jobs_in_page(
                 scraper_input, continue_token
             )
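Moving the sleep out of find_jobs_in_page and gating it on page > 1 keeps the first request instant while still throttling every subsequent page. The shape of that loop, with a shortened delay for illustration:

import time

delay = 0.01  # the scraper itself uses self.delay = 5
for page in range(1, 4):
    if page > 1:
        time.sleep(delay)  # throttle from the second request onward
    # ... fetch and parse this page ...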
@@ -108,7 +111,7 @@ class ZipRecruiterScraper(Scraper):
         description_soup = BeautifulSoup(job_description_html, "html.parser")
         description = modify_and_get_description(description_soup)

-        company = job["hiring_company"].get("name") if "hiring_company" in job else None
+        company = job.get("hiring_company", {}).get("name")
         country_value = "usa" if job.get("job_country") == "US" else "canada"
         country_enum = Country.from_string(country_value)
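The rewritten lookup collapses the membership test into chained .get() calls: a missing "hiring_company" key yields an empty dict, so the name lookup returns None instead of raising KeyError. With hypothetical payloads:

job = {"title": "Engineer"}  # no "hiring_company" key
assert job.get("hiring_company", {}).get("name") is None

job = {"hiring_company": {"name": "Acme"}}
assert job.get("hiring_company", {}).get("name") == "Acme"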
@@ -184,6 +187,8 @@ class ZipRecruiterScraper(Scraper):
         if scraper_input.distance:
             params["radius"] = scraper_input.distance

+        params = {k: v for k, v in params.items() if v is not None}
+
         return params

     @staticmethod