Compare commits

...

9 Commits

Author SHA1 Message Date
Cullen Watson
757a94853e chore:version 2024-10-08 17:49:06 -05:00
Marcel Gozalbo Baró
6bc191d5c7 FEATURE: Add the "ca_cert" setting for providing a Certification Authority certificate in order to use proxies requiring it. (#204) 2024-10-08 17:46:46 -05:00
Cullen Watson
0cc34287f7 fix:turkey 2024-10-02 01:31:00 -05:00
Anton Pikhteryev
923979093b Add Malta for linkedin country support (#198) 2024-09-19 20:41:22 -05:00
Cullen Watson
286f0e4487 docs:readme 2024-09-18 18:49:41 -05:00
Cullen Watson
f7b29d43a2 fix(indeed):sort relevance not date (#197) 2024-09-18 18:42:25 -05:00
Cullen Watson
6f1490458c fix key error (#186) 2024-08-14 02:54:40 -05:00
Cullen Watson
6bb7d81ba8 change linkedin ep (#185) 2024-08-14 02:39:43 -05:00
Cullen Watson
0e046432d1 fix:variable bug (#181) 2024-08-05 12:47:55 -05:00
10 changed files with 40 additions and 22 deletions

View File

@@ -37,7 +37,7 @@ jobs = scrape_jobs(
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old) hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor country_indeed='USA', # only needed for indeed / glassdoor
# linkedin_fetch_description=True # get full description , direct job url , company industry and job level (seniority level) for linkedin (slower) # linkedin_fetch_description=True # get more info such as full description, direct job url for linkedin (slower)
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"], # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
) )
@@ -79,6 +79,9 @@ Optional
├── proxies (list): ├── proxies (list):
| in format ['user:pass@host:port', 'localhost'] | in format ['user:pass@host:port', 'localhost']
| each job board scraper will round robin through the proxies | each job board scraper will round robin through the proxies
|
├── ca_cert (str)
| path to CA Certificate file for proxies
├── is_remote (bool) ├── is_remote (bool)
@@ -216,10 +219,8 @@ You can specify the following countries when searching on Indeed (use the exact
## Frequently Asked Questions ## Frequently Asked Questions
--- ---
**Q: Why is Indeed giving unrelated roles?**
**Q: Encountering issues with your queries?** **A:** Indeed searches for each of your terms separately, e.g. for software intern it searches software OR intern. Try search_term='"software intern"' (with the inner double quotes) for stricter searching.
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
--- ---
@@ -230,3 +231,9 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
- Try using the proxies param to change your IP address. - Try using the proxies param to change your IP address.
--- ---
**Q: Encountering issues with your queries?**
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
---

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.62" version = "1.1.69"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -31,6 +31,7 @@ def scrape_jobs(
country_indeed: str = "usa", country_indeed: str = "usa",
hyperlinks: bool = False, hyperlinks: bool = False,
proxies: list[str] | str | None = None, proxies: list[str] | str | None = None,
ca_cert: str | None = None,
description_format: str = "markdown", description_format: str = "markdown",
linkedin_fetch_description: bool | None = False, linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None, linkedin_company_ids: list[int] | None = None,
@@ -97,7 +98,7 @@ def scrape_jobs(
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies) scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize() cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name

View File

@@ -93,6 +93,7 @@ class Country(Enum):
KUWAIT = ("kuwait", "kw") KUWAIT = ("kuwait", "kw")
LUXEMBOURG = ("luxembourg", "lu") LUXEMBOURG = ("luxembourg", "lu")
MALAYSIA = ("malaysia", "malaysia:my", "com") MALAYSIA = ("malaysia", "malaysia:my", "com")
MALTA = ("malta", "malta:mt", "mt")
MEXICO = ("mexico", "mx", "com.mx") MEXICO = ("mexico", "mx", "com.mx")
MOROCCO = ("morocco", "ma") MOROCCO = ("morocco", "ma")
NETHERLANDS = ("netherlands", "nl", "nl") NETHERLANDS = ("netherlands", "nl", "nl")
@@ -117,7 +118,7 @@ class Country(Enum):
SWITZERLAND = ("switzerland", "ch", "de:ch") SWITZERLAND = ("switzerland", "ch", "de:ch")
TAIWAN = ("taiwan", "tw") TAIWAN = ("taiwan", "tw")
THAILAND = ("thailand", "th") THAILAND = ("thailand", "th")
TURKEY = ("turkey", "tr") TURKEY = ("türkiye,turkey", "tr")
UKRAINE = ("ukraine", "ua") UKRAINE = ("ukraine", "ua")
UNITEDARABEMIRATES = ("united arab emirates", "ae") UNITEDARABEMIRATES = ("united arab emirates", "ae")
UK = ("uk,united kingdom", "uk:gb", "co.uk") UK = ("uk,united kingdom", "uk:gb", "co.uk")

View File

@@ -42,9 +42,10 @@ class ScraperInput(BaseModel):
class Scraper(ABC): class Scraper(ABC):
def __init__(self, site: Site, proxies: list[str] | None = None): def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None):
self.proxies = proxies
self.site = site self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
@abstractmethod @abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

View File

@@ -34,12 +34,12 @@ from ...jobs import (
class GlassdoorScraper(Scraper): class GlassdoorScraper(Scraper):
def __init__(self, proxies: list[str] | str | None = None): def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
""" """
Initializes GlassdoorScraper with the Glassdoor job search url Initializes GlassdoorScraper with the Glassdoor job search url
""" """
site = Site(Site.GLASSDOOR) site = Site(Site.GLASSDOOR)
super().__init__(site, proxies=proxies) super().__init__(site, proxies=proxies, ca_cert=ca_cert)
self.base_url = None self.base_url = None
self.country = None self.country = None
@@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted) self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url() self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True) self.session = create_session(proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True)
token = self._get_csrf_token() token = self._get_csrf_token()
self.headers["gd-csrf-token"] = token if token else self.fallback_token self.headers["gd-csrf-token"] = token if token else self.fallback_token

View File

@@ -32,13 +32,13 @@ from ...jobs import (
class IndeedScraper(Scraper): class IndeedScraper(Scraper):
def __init__(self, proxies: list[str] | str | None = None): def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
""" """
Initializes IndeedScraper with the Indeed API url Initializes IndeedScraper with the Indeed API url
""" """
super().__init__(Site.INDEED, proxies=proxies) super().__init__(Site.INDEED, proxies=proxies)
self.session = create_session(proxies=self.proxies, is_tls=False) self.session = create_session(proxies=self.proxies, ca_cert=ca_cert, is_tls=False)
self.scraper_input = None self.scraper_input = None
self.jobs_per_page = 100 self.jobs_per_page = 100
self.num_workers = 10 self.num_workers = 10
@@ -364,8 +364,8 @@ class IndeedScraper(Scraper):
{what} {what}
{location} {location}
limit: 100 limit: 100
sort: DATE
{cursor} {cursor}
sort: RELEVANCE
{filters} {filters}
) {{ ) {{
pageInfo {{ pageInfo {{

View File

@@ -44,13 +44,14 @@ class LinkedInScraper(Scraper):
band_delay = 4 band_delay = 4
jobs_per_page = 25 jobs_per_page = 25
def __init__(self, proxies: list[str] | str | None = None): def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
""" """
super().__init__(Site.LINKEDIN, proxies=proxies) super().__init__(Site.LINKEDIN, proxies=proxies, ca_cert=ca_cert)
self.session = create_session( self.session = create_session(
proxies=self.proxies, proxies=self.proxies,
ca_cert=ca_cert,
is_tls=False, is_tls=False,
has_retry=True, has_retry=True,
delay=5, delay=5,
@@ -236,7 +237,7 @@ class LinkedInScraper(Scraper):
""" """
try: try:
response = self.session.get( response = self.session.get(
f"{self.base_url}/jobs-guest/jobs/api/jobPosting/{job_id}", timeout=5 f"{self.base_url}/jobs/view/{job_id}", timeout=5
) )
response.raise_for_status() response.raise_for_status()
except: except:

View File

@@ -100,6 +100,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
def create_session( def create_session(
*, *,
proxies: dict | str | None = None, proxies: dict | str | None = None,
ca_cert: str | None = None,
is_tls: bool = True, is_tls: bool = True,
has_retry: bool = False, has_retry: bool = False,
delay: int = 1, delay: int = 1,
@@ -119,6 +120,9 @@ def create_session(
clear_cookies=clear_cookies, clear_cookies=clear_cookies,
) )
if ca_cert:
session.verify = ca_cert
return session return session
@@ -198,6 +202,7 @@ def extract_salary(
if not salary_str: if not salary_str:
return None, None, None, None return None, None, None, None
annual_max_salary = None
min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)" min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"
def to_int(s): def to_int(s):
@@ -238,6 +243,8 @@ def extract_salary(
annual_max_salary = max_salary annual_max_salary = max_salary
# Ensure salary range is within specified limits # Ensure salary range is within specified limits
if not annual_max_salary:
return None, None, None, None
if ( if (
lower_limit <= annual_min_salary <= upper_limit lower_limit <= annual_min_salary <= upper_limit
and lower_limit <= annual_max_salary <= upper_limit and lower_limit <= annual_max_salary <= upper_limit

View File

@@ -41,14 +41,14 @@ class ZipRecruiterScraper(Scraper):
base_url = "https://www.ziprecruiter.com" base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com" api_url = "https://api.ziprecruiter.com"
def __init__(self, proxies: list[str] | str | None = None): def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
""" """
Initializes ZipRecruiterScraper with the ZipRecruiter job search url Initializes ZipRecruiterScraper with the ZipRecruiter job search url
""" """
super().__init__(Site.ZIP_RECRUITER, proxies=proxies) super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
self.scraper_input = None self.scraper_input = None
self.session = create_session(proxies=proxies) self.session = create_session(proxies=proxies, ca_cert=ca_cert)
self._get_cookies() self._get_cookies()
self.delay = 5 self.delay = 5
@@ -200,7 +200,7 @@ class ZipRecruiterScraper(Scraper):
script_tag = soup.find("script", type="application/json") script_tag = soup.find("script", type="application/json")
if script_tag: if script_tag:
job_json = json.loads(script_tag.string) job_json = json.loads(script_tag.string)
job_url_val = job_json["model"]["saveJobURL"] job_url_val = job_json["model"].get("saveJobURL", "")
m = re.search(r"job_url=(.+)", job_url_val) m = re.search(r"job_url=(.+)", job_url_val)
if m: if m:
job_url_direct = m.group(1) job_url_direct = m.group(1)