mirror of https://github.com/Bunsly/JobSpy
enh(linkedin): search by company ids (#99)
parent 02caf1b38d
commit 6ec7c24f7f
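The change in a nutshell: `scrape_jobs` gains a `linkedin_company_ids` parameter that is forwarded to LinkedIn's `f_C` query filter. A minimal usage sketch — the numeric IDs below are hypothetical placeholders, and the call assumes the released `python-jobspy` package:

```python
from jobspy import scrape_jobs

# Restrict LinkedIn results to specific companies via their numeric
# LinkedIn company IDs (the IDs here are hypothetical placeholders).
jobs = scrape_jobs(
    site_name="linkedin",
    search_term="software engineer",
    linkedin_company_ids=[1441, 1035],
    results_wanted=10,
)
print(jobs.head())
```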
```diff
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.41"
+version = "1.1.42"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
```
```diff
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -1,7 +1,7 @@
 import pandas as pd
-from typing import Tuple
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
+from typing import Tuple, Optional
 
 from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
```
```diff
@@ -29,19 +29,20 @@ def _map_str_to_site(site_name: str) -> Site:
 
 
 def scrape_jobs(
-    site_name: str | list[str] | Site | list[Site],
-    search_term: str,
-    location: str = "",
-    distance: int = None,
+    site_name: str | list[str] | Site | list[Site] | None = None,
+    search_term: str | None = None,
+    location: str | None = None,
+    distance: int | None = None,
     is_remote: bool = False,
-    job_type: str = None,
-    easy_apply: bool = False,  # linkedin
+    job_type: str | None = None,
+    easy_apply: bool | None = None,
     results_wanted: int = 15,
     country_indeed: str = "usa",
     hyperlinks: bool = False,
-    proxy: Optional[str] = None,
-    full_description: Optional[bool] = False,
-    offset: Optional[int] = 0,
+    proxy: str | None = None,
+    full_description: bool | None = False,
+    linkedin_company_ids: list[int] | None = None,
+    offset: int | None = 0,
 ) -> pd.DataFrame:
     """
     Simultaneously scrapes job data from multiple job sites.
```
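Every argument is now optional, and `site_name=None` falls back to all supported sites (see the `list(Site)` default in the next hunk), so the minimal call shrinks accordingly. A hedged sketch of what the new signature permits, assuming network access:

```python
from jobspy import scrape_jobs

# With site_name omitted, the new default of None means "search every
# supported site"; all other filters keep their defaults from the signature.
jobs = scrape_jobs(search_term="data engineer")
print(len(jobs))
```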
```diff
@@ -56,18 +57,23 @@
 
     job_type = get_enum_from_value(job_type) if job_type else None
 
-    if type(site_name) == str:
-        site_type = [_map_str_to_site(site_name)]
-    else:  #: if type(site_name) == list
-        site_type = [
-            _map_str_to_site(site) if type(site) == str else site_name
-            for site in site_name
-        ]
+    def get_site_type():
+        site_types = list(Site)
+        if isinstance(site_name, str):
+            site_types = [_map_str_to_site(site_name)]
+        elif isinstance(site_name, Site):
+            site_types = [site_name]
+        elif isinstance(site_name, list):
+            site_types = [
+                _map_str_to_site(site) if isinstance(site, str) else site
+                for site in site_name
+            ]
+        return site_types
 
     country_enum = Country.from_string(country_indeed)
 
     scraper_input = ScraperInput(
-        site_type=site_type,
+        site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
         location=location,
```
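The `type(x) == str` comparisons become `isinstance` checks, which also accept subclasses, and the rewrite fixes a latent bug: the old comprehension returned `site_name` (the whole argument) instead of `site` for non-string entries. A standalone sketch of the same normalization, with `_map_str_to_site` approximated by an enum lookup:

```python
from enum import Enum

class Site(Enum):
    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"

def get_site_type(site_name=None):
    # None -> every supported site, mirroring the diff's list(Site) fallback.
    site_types = list(Site)
    if isinstance(site_name, str):
        site_types = [Site[site_name.upper()]]  # stand-in for _map_str_to_site
    elif isinstance(site_name, Site):
        site_types = [site_name]
    elif isinstance(site_name, list):
        site_types = [
            Site[s.upper()] if isinstance(s, str) else s for s in site_name
        ]
    return site_types

print(get_site_type("linkedin"))                 # [<Site.LINKEDIN: 'linkedin'>]
print(get_site_type([Site.INDEED, "linkedin"]))  # mixed input is normalized
```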
```diff
@@ -77,6 +83,7 @@ def scrape_jobs(
         easy_apply=easy_apply,
         full_description=full_description,
         results_wanted=results_wanted,
+        linkedin_company_ids=linkedin_company_ids,
         offset=offset,
     )
 
```
@ -1,5 +1,4 @@
|
|||
from ..jobs import Enum, BaseModel, JobType, JobResponse, Country
|
||||
from typing import List, Optional, Any
|
||||
|
||||
|
||||
class Site(Enum):
|
||||
|
@ -10,23 +9,24 @@ class Site(Enum):
|
|||
|
||||
|
||||
class ScraperInput(BaseModel):
|
||||
site_type: List[Site]
|
||||
search_term: str
|
||||
site_type: list[Site]
|
||||
search_term: str | None = None
|
||||
|
||||
location: str = None
|
||||
country: Optional[Country] = Country.USA
|
||||
distance: Optional[int] = None
|
||||
location: str | None = None
|
||||
country: Country | None = Country.USA
|
||||
distance: int | None = None
|
||||
is_remote: bool = False
|
||||
job_type: Optional[JobType] = None
|
||||
easy_apply: bool = None # linkedin
|
||||
job_type: JobType | None = None
|
||||
easy_apply: bool | None = None
|
||||
full_description: bool = False
|
||||
offset: int = 0
|
||||
linkedin_company_ids: list[int] | None = None
|
||||
|
||||
results_wanted: int = 15
|
||||
|
||||
|
||||
class Scraper:
|
||||
def __init__(self, site: Site, proxy: Optional[List[str]] = None):
|
||||
def __init__(self, site: Site, proxy: list[str] | None = None):
|
||||
self.site = site
|
||||
self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
|
||||
|
||||
|
|
|
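Pydantic validates the new field like the rest of the model, so a `ScraperInput` can carry the company filter end to end. A hypothetical construction — the import path is assumed from the package layout shown in this diff, and the ID is a placeholder:

```python
from jobspy.scrapers import ScraperInput, Site

scraper_input = ScraperInput(
    site_type=[Site.LINKEDIN],
    search_term="backend developer",
    linkedin_company_ids=[1441],  # hypothetical LinkedIn company ID
)
print(scraper_input.linkedin_company_ids)  # [1441]
```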
```diff
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -348,7 +348,7 @@ class IndeedScraper(Scraper):
     def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
         params = {
             "q": scraper_input.search_term,
-            "l": scraper_input.location,
+            "l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1],
             "filter": 0,
             "start": scraper_input.offset + page * 10,
             "sort": "date"
```
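When no location is given, Indeed's `l` parameter now falls back to a short country code derived from the `Country` enum. A sketch of that expression in isolation; the enum value's exact shape is an assumption inferred from the `.split(',')`:

```python
from enum import Enum

class Country(Enum):
    # Assumed value shape: comma-separated aliases with the short code last.
    USA = ("usa,us", "www.indeed.com")

location = None  # user passed no location
country = Country.USA
l_param = location if location else country.value[0].split(",")[-1]
print(l_param)  # "us"
```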
```diff
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -70,7 +70,9 @@ class LinkedInScraper(Scraper):
 
             return mapping.get(job_type_enum, "")
 
-        while len(job_list) < scraper_input.results_wanted and page < 1000:
+        continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
+
+        while continue_search():
             session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
```
```diff
@@ -83,6 +85,7 @@ class LinkedInScraper(Scraper):
                 "pageNum": 0,
                 "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
+                "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None
             }
 
             params = {k: v for k, v in params.items() if v is not None}
```
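LinkedIn's `f_C` filter takes a comma-separated string of numeric company IDs; unset filters stay `None` and are stripped by the dict comprehension before the request is built. A small sketch of that assembly (values hypothetical):

```python
linkedin_company_ids = [1441, 1035]  # hypothetical IDs
params = {
    "keywords": "python developer",
    "f_AL": None,  # easy_apply not requested
    "f_C": ",".join(map(str, linkedin_company_ids)) if linkedin_company_ids else None,
}
# Drop unset parameters before building the query string, as the diff does.
params = {k: v for k, v in params.items() if v is not None}
print(params)  # {'keywords': 'python developer', 'f_C': '1441,1035'}
```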
```diff
@@ -130,8 +133,9 @@ class LinkedInScraper(Scraper):
             except Exception as e:
                 raise LinkedInException("Exception occurred while processing jobs")
 
-            page += 25
+            if continue_search():
+                time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
+                page += 25
 
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
```
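The `continue_search` lambda is evaluated again after each page, so the scraper only sleeps when another request will actually follow. A self-contained sketch of that throttling pattern, with a made-up `DELAY` and a stub in place of the real page fetch:

```python
import random
import time

DELAY = 1  # base delay in seconds; value assumed for illustration
results_wanted, page, job_list = 50, 0, []

continue_search = lambda: len(job_list) < results_wanted and page < 1000

while continue_search():
    job_list.extend([f"job-{page + i}" for i in range(25)])  # stub for one page fetch
    if continue_search():
        # Jittered sleep between pages, skipped once enough jobs are collected.
        time.sleep(random.uniform(DELAY, DELAY + 2))
        page += 25

print(len(job_list[:results_wanted]))  # 50
```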
```diff
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -31,6 +31,7 @@ class ZipRecruiterScraper(Scraper):
 
         self.jobs_per_page = 20
         self.seen_urls = set()
+        self.delay = 5
 
     def find_jobs_in_page(
         self, scraper_input: ScraperInput, continue_token: str | None = None
@@ -59,7 +60,6 @@ class ZipRecruiterScraper(Scraper):
                 raise ZipRecruiterException("bad proxy")
             raise ZipRecruiterException(str(e))
 
-        time.sleep(5)
         response_data = response.json()
         jobs_list = response_data.get("jobs", [])
         next_continue_token = response_data.get("continue", None)
```
```diff
@@ -85,6 +85,9 @@ class ZipRecruiterScraper(Scraper):
             if len(job_list) >= scraper_input.results_wanted:
                 break
 
+            if page > 1:
+                time.sleep(self.delay)
+
             jobs_on_page, continue_token = self.find_jobs_in_page(
                 scraper_input, continue_token
             )
```
```diff
@@ -108,7 +111,7 @@ class ZipRecruiterScraper(Scraper):
         description_soup = BeautifulSoup(job_description_html, "html.parser")
         description = modify_and_get_description(description_soup)
 
-        company = job["hiring_company"].get("name") if "hiring_company" in job else None
+        company = job.get("hiring_company", {}).get("name")
         country_value = "usa" if job.get("job_country") == "US" else "canada"
         country_enum = Country.from_string(country_value)
 
```
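`dict.get` with an empty-dict default collapses the old two-step membership check into one chained lookup that yields `None` whenever either key is missing. A quick illustration on sample payloads (field names taken from the diff):

```python
jobs = [
    {"hiring_company": {"name": "Acme"}},  # normal payload
    {"hiring_company": {}},                # company present, name missing
    {},                                    # company missing entirely
]
for job in jobs:
    print(job.get("hiring_company", {}).get("name"))
# Acme / None / None
```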
```diff
@@ -184,6 +187,8 @@
         if scraper_input.distance:
             params["radius"] = scraper_input.distance
 
+        params = {k: v for k, v in params.items() if v is not None}
+
         return params
 
     @staticmethod
```