mirror of https://github.com/Bunsly/JobSpy
enh(linkedin): search by company ids (#99)
parent 02caf1b38d
commit 6ec7c24f7f
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.41"
+version = "1.1.42"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
@@ -1,7 +1,7 @@
 import pandas as pd
+from typing import Tuple
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
-from typing import Tuple, Optional

 from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
@@ -29,19 +29,20 @@ def _map_str_to_site(site_name: str) -> Site:


 def scrape_jobs(
-    site_name: str | list[str] | Site | list[Site],
-    search_term: str,
-    location: str = "",
-    distance: int = None,
+    site_name: str | list[str] | Site | list[Site] | None = None,
+    search_term: str | None = None,
+    location: str | None = None,
+    distance: int | None = None,
     is_remote: bool = False,
-    job_type: str = None,
-    easy_apply: bool = False, # linkedin
+    job_type: str | None = None,
+    easy_apply: bool | None = None,
     results_wanted: int = 15,
     country_indeed: str = "usa",
     hyperlinks: bool = False,
-    proxy: Optional[str] = None,
-    full_description: Optional[bool] = False,
-    offset: Optional[int] = 0,
+    proxy: str | None = None,
+    full_description: bool | None = False,
+    linkedin_company_ids: list[int] | None = None,
+    offset: int | None = 0,
 ) -> pd.DataFrame:
     """
     Simultaneously scrapes job data from multiple job sites.
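Every parameter of scrape_jobs is now optional (omitting site_name searches all supported boards), and the new linkedin_company_ids argument narrows LinkedIn results to specific companies. A minimal usage sketch — the ids below are placeholders, not real company ids:

    from jobspy import scrape_jobs

    jobs = scrape_jobs(
        site_name="linkedin",
        search_term="software engineer",
        linkedin_company_ids=[1441, 10667],  # hypothetical ids taken from company page URLs
        results_wanted=15,
    )
    print(jobs.head())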
@@ -56,18 +57,23 @@ def scrape_jobs(

     job_type = get_enum_from_value(job_type) if job_type else None

-    if type(site_name) == str:
-        site_type = [_map_str_to_site(site_name)]
-    else: #: if type(site_name) == list
-        site_type = [
-            _map_str_to_site(site) if type(site) == str else site_name
-            for site in site_name
-        ]
+    def get_site_type():
+        site_types = list(Site)
+        if isinstance(site_name, str):
+            site_types = [_map_str_to_site(site_name)]
+        elif isinstance(site_name, Site):
+            site_types = [site_name]
+        elif isinstance(site_name, list):
+            site_types = [
+                _map_str_to_site(site) if isinstance(site, str) else site
+                for site in site_name
+            ]
+        return site_types

     country_enum = Country.from_string(country_indeed)

     scraper_input = ScraperInput(
-        site_type=site_type,
+        site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
         location=location,
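get_site_type() replaces the old two-branch type() check. It handles the new None default (all sites), accepts a bare Site enum, and fixes a latent bug: the old comprehension fell back to site_name (the whole argument) rather than the current element site. The same dispatch pattern as a standalone sketch, with a trimmed stand-in Site enum:

    from enum import Enum

    class Site(Enum):
        LINKEDIN = "linkedin"
        INDEED = "indeed"

    def normalize(site_name) -> list[Site]:
        # None -> all sites; str/Site -> one site; list -> map element-wise
        if site_name is None:
            return list(Site)
        if isinstance(site_name, str):
            return [Site[site_name.upper()]]
        if isinstance(site_name, Site):
            return [site_name]
        return [Site[s.upper()] if isinstance(s, str) else s for s in site_name]

    print(normalize(["linkedin", Site.INDEED]))  # [Site.LINKEDIN, Site.INDEED]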
@@ -77,6 +83,7 @@ def scrape_jobs(
         easy_apply=easy_apply,
         full_description=full_description,
         results_wanted=results_wanted,
+        linkedin_company_ids=linkedin_company_ids,
         offset=offset,
     )

@@ -1,5 +1,4 @@
 from ..jobs import Enum, BaseModel, JobType, JobResponse, Country
-from typing import List, Optional, Any


 class Site(Enum):
@@ -10,23 +9,24 @@ class Site(Enum):


 class ScraperInput(BaseModel):
-    site_type: List[Site]
-    search_term: str
+    site_type: list[Site]
+    search_term: str | None = None

-    location: str = None
-    country: Optional[Country] = Country.USA
-    distance: Optional[int] = None
+    location: str | None = None
+    country: Country | None = Country.USA
+    distance: int | None = None
     is_remote: bool = False
-    job_type: Optional[JobType] = None
-    easy_apply: bool = None # linkedin
+    job_type: JobType | None = None
+    easy_apply: bool | None = None
     full_description: bool = False
     offset: int = 0
+    linkedin_company_ids: list[int] | None = None

     results_wanted: int = 15


 class Scraper:
-    def __init__(self, site: Site, proxy: Optional[List[str]] = None):
+    def __init__(self, site: Site, proxy: list[str] | None = None):
         self.site = site
         self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)

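The model and the Scraper base class drop typing.Optional/List for the PEP 604/585 builtins, and ScraperInput picks up the new linkedin_company_ids field. Note the proxy annotation says list[str] while the lambda in __init__ treats it as a single URL, producing the scheme-to-proxy mapping that requests expects. That one-liner written out as a plain function — a sketch, not the library's API:

    def proxy_dict(proxy: str | None) -> dict[str, str] | None:
        # requests-style proxies mapping; None disables proxying
        return {"http": proxy, "https": proxy} if proxy else None

    assert proxy_dict(None) is None
    assert proxy_dict("http://user:pass@host:8080") == {
        "http": "http://user:pass@host:8080",
        "https": "http://user:pass@host:8080",
    }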
@@ -348,7 +348,7 @@ class IndeedScraper(Scraper):
     def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
         params = {
             "q": scraper_input.search_term,
-            "l": scraper_input.location,
+            "l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1],
             "filter": 0,
             "start": scraper_input.offset + page * 10,
             "sort": "date"
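With location now optional, Indeed's "l" parameter falls back to a value derived from the Country enum. Assuming value[0] is a comma-separated alias string such as "usa,us" (an assumption about the enum's contents, not confirmed by this diff), split(',')[-1] picks the last alias:

    # sketch of the fallback; "usa,us" stands in for scraper_input.country.value[0]
    location = None
    country_aliases = "usa,us"
    l_param = location if location else country_aliases.split(',')[-1]
    print(l_param)  # "us"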
@@ -70,7 +70,9 @@ class LinkedInScraper(Scraper):
             return mapping.get(job_type_enum, "")

-        while len(job_list) < scraper_input.results_wanted and page < 1000:
+        continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
+
+        while continue_search():
             session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
@@ -83,6 +85,7 @@ class LinkedInScraper(Scraper):
                 "pageNum": 0,
                 "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
+                "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None
             }

             params = {k: v for k, v in params.items() if v is not None}
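The feature itself: LinkedIn's f_C query parameter takes a comma-separated list of company ids, and the existing None-stripping comprehension drops it when no ids were passed. In isolation:

    company_ids = [1441, 10667]  # hypothetical LinkedIn company ids
    params = {
        "keywords": "python",
        "f_C": ','.join(map(str, company_ids)) if company_ids else None,
    }
    params = {k: v for k, v in params.items() if v is not None}
    print(params)  # {'keywords': 'python', 'f_C': '1441,10667'}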
@@ -130,8 +133,9 @@ class LinkedInScraper(Scraper):
             except Exception as e:
                 raise LinkedInException("Exception occurred while processing jobs")

-            page += 25
-            time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
+            if continue_search():
+                time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
+                page += 25

         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
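Guarding the sleep with continue_search() drops the pointless final delay after the last page has been fetched; page still advances by 25, matching the stride of LinkedIn's result pages. The loop skeleton with a stand-in DELAY value and a fake page fetch:

    import random
    import time

    DELAY = 3  # stand-in for LinkedInScraper.DELAY
    results_wanted = 40
    job_list: list[str] = []
    page = 0

    def continue_search() -> bool:
        return len(job_list) < results_wanted and page < 1000

    while continue_search():
        job_list.extend(f"job-{page + i}" for i in range(25))  # fake page fetch
        if continue_search():  # another page is coming: wait, then advance
            time.sleep(random.uniform(DELAY, DELAY + 2))
            page += 25
    job_list = job_list[:results_wanted]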
@@ -31,6 +31,7 @@ class ZipRecruiterScraper(Scraper):

         self.jobs_per_page = 20
         self.seen_urls = set()
+        self.delay = 5

     def find_jobs_in_page(
         self, scraper_input: ScraperInput, continue_token: str | None = None
@@ -59,7 +60,6 @@ class ZipRecruiterScraper(Scraper):
                 raise ZipRecruiterException("bad proxy")
             raise ZipRecruiterException(str(e))

-        time.sleep(5)
         response_data = response.json()
         jobs_list = response_data.get("jobs", [])
         next_continue_token = response_data.get("continue", None)
@@ -85,6 +85,9 @@ class ZipRecruiterScraper(Scraper):
             if len(job_list) >= scraper_input.results_wanted:
                 break

+            if page > 1:
+                time.sleep(self.delay)
+
             jobs_on_page, continue_token = self.find_jobs_in_page(
                 scraper_input, continue_token
             )
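Together with the two hunks above, this moves ZipRecruiter's throttling from a fixed time.sleep(5) after every response to a self.delay pause before every request except the first, so a single-page search pays no delay at all. The pattern in miniature:

    import time

    delay = 5  # mirrors the new self.delay attribute
    for page in range(1, 4):
        if page > 1:
            time.sleep(delay)  # the first page fires immediately
        print(f"fetching page {page}")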
@@ -108,7 +111,7 @@ class ZipRecruiterScraper(Scraper):
         description_soup = BeautifulSoup(job_description_html, "html.parser")
         description = modify_and_get_description(description_soup)

-        company = job["hiring_company"].get("name") if "hiring_company" in job else None
+        company = job.get("hiring_company", {}).get("name")
         country_value = "usa" if job.get("job_country") == "US" else "canada"
         country_enum = Country.from_string(country_value)

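The chained-get idiom replaces the membership test: defaulting to an empty dict makes a missing hiring_company key yield None instead of raising. For example:

    job = {"title": "Backend Engineer"}  # no "hiring_company" key
    print(job.get("hiring_company", {}).get("name"))  # None, no KeyError

    job = {"hiring_company": {"name": "Acme"}}  # hypothetical payload shape
    print(job.get("hiring_company", {}).get("name"))  # Acme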
@@ -184,6 +187,8 @@ class ZipRecruiterScraper(Scraper):
         if scraper_input.distance:
             params["radius"] = scraper_input.distance

+        params = {k: v for k, v in params.items() if v is not None}
+
         return params

     @staticmethod
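As on the LinkedIn side, stripping None values keeps unset filters out of the request entirely, whatever the HTTP client would otherwise do with them. A minimal sketch with hypothetical keys:

    params = {"search": "python", "radius": None, "page": 2}
    params = {k: v for k, v in params.items() if v is not None}
    print(params)  # {'search': 'python', 'page': 2}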