diff --git a/README.md b/README.md
index 90123e3..6347c71 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ work with us.*
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame
-- Proxy support
+- Proxy rotation support
[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
Updated for release v1.1.3
@@ -39,7 +39,10 @@ jobs = scrape_jobs(
results_wanted=20,
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor
+
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
+ # proxies=["user:pass@host:port", "localhost"],
+
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
@@ -76,8 +79,9 @@ Optional
├── job_type (str):
| fulltime, parttime, internship, contract
│
-├── proxy (str):
-| in format 'http://user:pass@host:port'
+├── proxies (list):
+| in format ['user:pass@host:port', 'localhost']
+| each job board will round robin through the proxies
│
├── is_remote (bool)
│
@@ -201,7 +205,7 @@ You can specify the following countries when searching on Indeed (use the exact
## Notes
* Indeed is the best scraper currently with no rate limiting.
* All the job board endpoints are capped at around 1000 jobs on a given search.
-* LinkedIn is the most restrictive and usually rate limits around the 10th page.
+* LinkedIn is the most restrictive and usually rate limits around the 10th page with a single IP; using proxies is essentially required.
## Frequently Asked Questions
@@ -216,7 +220,7 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
**Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:
-- Waiting some time between scrapes (site-dependent).
-- Trying a VPN or proxy to change your IP address.
+- Waiting some time between scrapes (site-dependent).
+- Trying the proxies param to change your IP address.
---
diff --git a/examples/JobSpy_AllSites.py b/examples/JobSpy_AllSites.py
deleted file mode 100644
index ad43c29..0000000
--- a/examples/JobSpy_AllSites.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-jobs: pd.DataFrame = scrape_jobs(
- site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
- search_term="software engineer",
- location="Dallas, TX",
- results_wanted=25, # be wary the higher it is, the more likey you'll get blocked (rotating proxy can help tho)
- country_indeed="USA",
- # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
-)
-
-# formatting for pandas
-pd.set_option("display.max_columns", None)
-pd.set_option("display.max_rows", None)
-pd.set_option("display.width", None)
-pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc
-
-# 1: output to console
-print(jobs)
-
-# 2: output to .csv
-jobs.to_csv("./jobs.csv", index=False)
-print("outputted to jobs.csv")
-
-# 3: output to .xlsx
-# jobs.to_xlsx('jobs.xlsx', index=False)
-
-# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
-# display(jobs)
diff --git a/examples/JobSpy_Demo.ipynb b/examples/JobSpy_Demo.ipynb
deleted file mode 100644
index 6c182f3..0000000
--- a/examples/JobSpy_Demo.ipynb
+++ /dev/null
@@ -1,167 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "00a94b47-f47b-420f-ba7e-714ef219c006",
- "metadata": {},
- "outputs": [],
- "source": [
- "from jobspy import scrape_jobs\n",
- "import pandas as pd\n",
- "from IPython.display import display, HTML"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9f773e6c-d9fc-42cc-b0ef-63b739e78435",
- "metadata": {},
- "outputs": [],
- "source": [
- "pd.set_option('display.max_columns', None)\n",
- "pd.set_option('display.max_rows', None)\n",
- "pd.set_option('display.width', None)\n",
- "pd.set_option('display.max_colwidth', 50)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1253c1f8-9437-492e-9dd3-e7fe51099420",
- "metadata": {},
- "outputs": [],
- "source": [
- "# example 1 (no hyperlinks, USA)\n",
- "jobs = scrape_jobs(\n",
- " site_name=[\"linkedin\"],\n",
- " location='san francisco',\n",
- " search_term=\"engineer\",\n",
- " results_wanted=5,\n",
- "\n",
- " # use if you want to use a proxy\n",
- " # proxy=\"socks5://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
- " proxy=\"http://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
- " #proxy=\"https://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
- ")\n",
- "display(jobs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6a581b2d-f7da-4fac-868d-9efe143ee20a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# example 2 - remote USA & hyperlinks\n",
- "jobs = scrape_jobs(\n",
- " site_name=[\"linkedin\", \"zip_recruiter\", \"indeed\"],\n",
- " # location='san francisco',\n",
- " search_term=\"software engineer\",\n",
- " country_indeed=\"USA\",\n",
- " hyperlinks=True,\n",
- " is_remote=True,\n",
- " results_wanted=5, \n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "fe8289bc-5b64-4202-9a64-7c117c83fd9a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# use if hyperlinks=True\n",
- "html = jobs.to_html(escape=False)\n",
- "# change max-width: 200px to show more or less of the content\n",
- "truncate_width = f'{html}'\n",
- "display(HTML(truncate_width))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "951c2fe1-52ff-407d-8bb1-068049b36777",
- "metadata": {},
- "outputs": [],
- "source": [
- "# example 3 - with hyperlinks, international - linkedin (no zip_recruiter)\n",
- "jobs = scrape_jobs(\n",
- " site_name=[\"linkedin\"],\n",
- " location='berlin',\n",
- " search_term=\"engineer\",\n",
- " hyperlinks=True,\n",
- " results_wanted=5,\n",
- " easy_apply=True\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1e37a521-caef-441c-8fc2-2eb5b2e7da62",
- "metadata": {},
- "outputs": [],
- "source": [
- "# use if hyperlinks=True\n",
- "html = jobs.to_html(escape=False)\n",
- "# change max-width: 200px to show more or less of the content\n",
- "truncate_width = f'{html}'\n",
- "display(HTML(truncate_width))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0650e608-0b58-4bf5-ae86-68348035b16a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# example 4 - international indeed (no zip_recruiter)\n",
- "jobs = scrape_jobs(\n",
- " site_name=[\"indeed\"],\n",
- " search_term=\"engineer\",\n",
- " country_indeed = \"China\",\n",
- " hyperlinks=True\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "40913ac8-3f8a-4d7e-ac47-afb88316432b",
- "metadata": {},
- "outputs": [],
- "source": [
- "# use if hyperlinks=True\n",
- "html = jobs.to_html(escape=False)\n",
- "# change max-width: 200px to show more or less of the content\n",
- "truncate_width = f'{html}'\n",
- "display(HTML(truncate_width))"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/JobSpy_LongScrape.py b/examples/JobSpy_LongScrape.py
deleted file mode 100644
index d0ac0f8..0000000
--- a/examples/JobSpy_LongScrape.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-import os
-import time
-
-# creates csv a new filename if the jobs.csv already exists.
-csv_filename = "jobs.csv"
-counter = 1
-while os.path.exists(csv_filename):
- csv_filename = f"jobs_{counter}.csv"
- counter += 1
-
-# results wanted and offset
-results_wanted = 1000
-offset = 0
-
-all_jobs = []
-
-# max retries
-max_retries = 3
-
-# nuumber of results at each iteration
-results_in_each_iteration = 30
-
-while len(all_jobs) < results_wanted:
- retry_count = 0
- while retry_count < max_retries:
- print("Doing from", offset, "to", offset + results_in_each_iteration, "jobs")
- try:
- jobs = scrape_jobs(
- site_name=["indeed"],
- search_term="software engineer",
- # New York, NY
- # Dallas, TX
- # Los Angeles, CA
- location="Los Angeles, CA",
- results_wanted=min(
- results_in_each_iteration, results_wanted - len(all_jobs)
- ),
- country_indeed="USA",
- offset=offset,
- # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
- )
-
- # Add the scraped jobs to the list
- all_jobs.extend(jobs.to_dict("records"))
-
- # Increment the offset for the next page of results
- offset += results_in_each_iteration
-
- # Add a delay to avoid rate limiting (you can adjust the delay time as needed)
- print(f"Scraped {len(all_jobs)} jobs")
- print("Sleeping secs", 100 * (retry_count + 1))
- time.sleep(100 * (retry_count + 1)) # Sleep for 2 seconds between requests
-
- break # Break out of the retry loop if successful
- except Exception as e:
- print(f"Error: {e}")
- retry_count += 1
- print("Sleeping secs before retry", 100 * (retry_count + 1))
- time.sleep(100 * (retry_count + 1))
- if retry_count >= max_retries:
- print("Max retries reached. Exiting.")
- break
-
-# DataFrame from the collected job data
-jobs_df = pd.DataFrame(all_jobs)
-
-# Formatting
-pd.set_option("display.max_columns", None)
-pd.set_option("display.max_rows", None)
-pd.set_option("display.width", None)
-pd.set_option("display.max_colwidth", 50)
-
-print(jobs_df)
-
-jobs_df.to_csv(csv_filename, index=False)
-print(f"Outputted to {csv_filename}")
diff --git a/pyproject.toml b/pyproject.toml
index cb275fb..f94ae2e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
-version = "1.1.53"
+version = "1.1.54"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton ", "Cullen Watson "]
homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index a2656cb..4ad1f74 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -30,7 +30,7 @@ def scrape_jobs(
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
- proxy: str | None = None,
+ proxies: list[str] | str | None = None,
description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
@@ -96,7 +96,7 @@ def scrape_jobs(
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
- scraper = scraper_class(proxy=proxy)
+ scraper = scraper_class(proxies=proxies)
scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 0ff2382..af278d7 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -39,9 +39,9 @@ class ScraperInput(BaseModel):
class Scraper(ABC):
- def __init__(self, site: Site, proxy: list[str] | None = None):
+ def __init__(self, site: Site, proxies: list[str] | None = None):
+ self.proxies = proxies
self.site = site
- self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index 89f5a95..b0dd733 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -34,12 +34,12 @@ from ...jobs import (
class GlassdoorScraper(Scraper):
- def __init__(self, proxy: Optional[str] = None):
+ def __init__(self, proxies: list[str] | str | None = None):
"""
Initializes GlassdoorScraper with the Glassdoor job search url
"""
site = Site(Site.GLASSDOOR)
- super().__init__(site, proxy=proxy)
+ super().__init__(site, proxies=proxies)
self.base_url = None
self.country = None
@@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url()
- self.session = create_session(self.proxy, is_tls=True, has_retry=True)
+ self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True)
token = self._get_csrf_token()
self.headers["gd-csrf-token"] = token if token else self.fallback_token
@@ -245,7 +245,6 @@ class GlassdoorScraper(Scraper):
if not location or is_remote:
return "11047", "STATE" # remote options
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
- session = create_session(self.proxy, has_retry=True)
res = self.session.get(url, headers=self.headers)
if res.status_code != 200:
if res.status_code == 429:
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index 58303f5..b5d6cd6 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -12,14 +12,13 @@ from typing import Tuple
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, Future
-import requests
-
from .. import Scraper, ScraperInput, Site
from ..utils import (
extract_emails_from_text,
get_enum_from_job_type,
markdown_converter,
logger,
+ create_session,
)
from ...jobs import (
JobPost,
@@ -33,10 +32,13 @@ from ...jobs import (
class IndeedScraper(Scraper):
- def __init__(self, proxy: str | None = None):
+ def __init__(self, proxies: list[str] | str | None = None):
"""
Initializes IndeedScraper with the Indeed API url
"""
+ super().__init__(Site.INDEED, proxies=proxies)
+
+ self.session = create_session(proxies=self.proxies, is_tls=False)
self.scraper_input = None
self.jobs_per_page = 100
self.num_workers = 10
@@ -45,8 +47,6 @@ class IndeedScraper(Scraper):
self.api_country_code = None
self.base_url = None
self.api_url = "https://apis.indeed.com/graphql"
- site = Site(Site.INDEED)
- super().__init__(site, proxy=proxy)
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@@ -90,13 +90,13 @@ class IndeedScraper(Scraper):
jobs = []
new_cursor = None
filters = self._build_filters()
- search_term = self.scraper_input.search_term.replace('"', '\\"') if self.scraper_input.search_term else ""
+ search_term = (
+ self.scraper_input.search_term.replace('"', '\\"')
+ if self.scraper_input.search_term
+ else ""
+ )
query = self.job_search_query.format(
- what=(
- f'what: "{search_term}"'
- if search_term
- else ""
- ),
+ what=(f'what: "{search_term}"' if search_term else ""),
location=(
f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
if self.scraper_input.location
@@ -111,11 +111,10 @@ class IndeedScraper(Scraper):
}
api_headers = self.api_headers.copy()
api_headers["indeed-co"] = self.api_country_code
- response = requests.post(
+ response = self.session.post(
self.api_url,
headers=api_headers,
json=payload,
- proxies=self.proxy,
timeout=10,
)
if response.status_code != 200:
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 18fbb84..840b2fb 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -10,14 +10,13 @@ from __future__ import annotations
import time
import random
import regex as re
-import urllib.parse
from typing import Optional
from datetime import datetime
from threading import Lock
from bs4.element import Tag
from bs4 import BeautifulSoup
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlparse, urlunparse, unquote
from .. import Scraper, ScraperInput, Site
from ..exceptions import LinkedInException
@@ -46,11 +45,19 @@ class LinkedInScraper(Scraper):
band_delay = 4
jobs_per_page = 25
- def __init__(self, proxy: Optional[str] = None):
+ def __init__(self, proxies: list[str] | str | None = None):
"""
Initializes LinkedInScraper with the LinkedIn job search url
"""
- super().__init__(Site(Site.LINKEDIN), proxy=proxy)
+ super().__init__(Site.LINKEDIN, proxies=proxies)
+ self.session = create_session(
+ proxies=self.proxies,
+ is_tls=False,
+ has_retry=True,
+ delay=5,
+ clear_cookies=True,
+ )
+ self.session.headers.update(self.headers)
self.scraper_input = None
self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
@@ -74,7 +81,6 @@ class LinkedInScraper(Scraper):
)
while continue_search():
logger.info(f"LinkedIn search page: {page // 25 + 1}")
- session = create_session(is_tls=False, has_retry=True, delay=5)
params = {
"keywords": scraper_input.search_term,
"location": scraper_input.location,
@@ -99,12 +105,9 @@ class LinkedInScraper(Scraper):
params = {k: v for k, v in params.items() if v is not None}
try:
- response = session.get(
+ response = self.session.get(
f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
params=params,
- allow_redirects=True,
- proxies=self.proxy,
- headers=self.headers,
timeout=10,
)
if response.status_code not in range(200, 400):
@@ -241,10 +244,7 @@ class LinkedInScraper(Scraper):
:return: dict
"""
try:
- session = create_session(is_tls=False, has_retry=True)
- response = session.get(
- job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
- )
+ response = self.session.get(job_page_url, timeout=5)
response.raise_for_status()
except:
return {}
@@ -340,7 +340,7 @@ class LinkedInScraper(Scraper):
job_url_direct_content.decode_contents().strip()
)
if job_url_direct_match:
- job_url_direct = urllib.parse.unquote(job_url_direct_match.group())
+ job_url_direct = unquote(job_url_direct_match.group())
return job_url_direct
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 8fef421..294d20c 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -2,6 +2,8 @@ from __future__ import annotations
import re
import logging
+from itertools import cycle
+
import requests
import tls_client
import numpy as np
@@ -21,6 +23,104 @@ if not logger.handlers:
logger.addHandler(console_handler)
+class RotatingProxySession:
+ def __init__(self, proxies=None):
+ if isinstance(proxies, str):
+ self.proxy_cycle = cycle([self.format_proxy(proxies)])
+ elif isinstance(proxies, list):
+ self.proxy_cycle = (
+ cycle([self.format_proxy(proxy) for proxy in proxies])
+ if proxies
+ else None
+ )
+ else:
+ self.proxy_cycle = None
+
+ @staticmethod
+ def format_proxy(proxy):
+ """Utility method to format a proxy string into a dictionary."""
+ if proxy.startswith("http://") or proxy.startswith("https://"):
+ return {"http": proxy, "https": proxy}
+ return {"http": f"http://{proxy}", "https": f"http://{proxy}"}
+
+
+class RequestsRotating(RotatingProxySession, requests.Session):
+
+ def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
+ RotatingProxySession.__init__(self, proxies=proxies)
+ requests.Session.__init__(self)
+ self.clear_cookies = clear_cookies
+ self.allow_redirects = True
+ self.setup_session(has_retry, delay)
+
+ def setup_session(self, has_retry, delay):
+ if has_retry:
+ retries = Retry(
+ total=3,
+ connect=3,
+ status=3,
+ status_forcelist=[500, 502, 503, 504, 429],
+ backoff_factor=delay,
+ )
+ adapter = HTTPAdapter(max_retries=retries)
+ self.mount("http://", adapter)
+ self.mount("https://", adapter)
+
+ def request(self, method, url, **kwargs):
+ if self.clear_cookies:
+ self.cookies.clear()
+
+ if self.proxy_cycle:
+ next_proxy = next(self.proxy_cycle)
+ if next_proxy["http"] != "http://localhost":
+ self.proxies = next_proxy
+ else:
+ self.proxies = {}
+ return requests.Session.request(self, method, url, **kwargs)
+
+
+class TLSRotating(RotatingProxySession, tls_client.Session):
+
+ def __init__(self, proxies=None):
+ RotatingProxySession.__init__(self, proxies=proxies)
+ tls_client.Session.__init__(self, random_tls_extension_order=True)
+
+ def execute_request(self, *args, **kwargs):
+ if self.proxy_cycle:
+ next_proxy = next(self.proxy_cycle)
+ if next_proxy["http"] != "http://localhost":
+ self.proxies = next_proxy
+ else:
+ self.proxies = {}
+ response = tls_client.Session.execute_request(self, *args, **kwargs)
+ return response
+
+
+def create_session(
+ *,
+ proxies: dict | str | None = None,
+ is_tls: bool = True,
+ has_retry: bool = False,
+ delay: int = 1,
+ clear_cookies: bool = False,
+) -> requests.Session:
+ """
+ Creates a requests session with optional tls, proxy, and retry settings.
+ :return: A session object
+ """
+ if is_tls:
+ session = TLSRotating(proxies=proxies)
+ else:
+ session = RequestsRotating(
+ proxies=proxies,
+ has_retry=has_retry,
+ delay=delay,
+ clear_cookies=clear_cookies,
+ )
+
+ return session
+
+
def set_logger_level(verbose: int = 2):
"""
Adjusts the logger's level. This function allows the logging level to be changed at runtime.
@@ -52,39 +152,6 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text)
-def create_session(
- proxy: dict | None = None,
- is_tls: bool = True,
- has_retry: bool = False,
- delay: int = 1,
-) -> requests.Session:
- """
- Creates a requests session with optional tls, proxy, and retry settings.
- :return: A session object
- """
- if is_tls:
- session = tls_client.Session(random_tls_extension_order=True)
- session.proxies = proxy
- else:
- session = requests.Session()
- session.allow_redirects = True
- if proxy:
- session.proxies.update(proxy)
- if has_retry:
- retries = Retry(
- total=3,
- connect=3,
- status=3,
- status_forcelist=[500, 502, 503, 504, 429],
- backoff_factor=delay,
- )
- adapter = HTTPAdapter(max_retries=retries)
-
- session.mount("http://", adapter)
- session.mount("https://", adapter)
- return session
-
-
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
"""
Given a string, returns the corresponding JobType enum member if a match is found.
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index fbe896f..7bf51bf 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -36,14 +36,15 @@ class ZipRecruiterScraper(Scraper):
base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com"
- def __init__(self, proxy: Optional[str] = None):
+ def __init__(self, proxies: list[str] | str | None = None):
"""
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
"""
+ super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
+
self.scraper_input = None
- self.session = create_session(proxy)
+ self.session = create_session(proxies=proxies)
self._get_cookies()
- super().__init__(Site.ZIP_RECRUITER, proxy=proxy)
self.delay = 5
self.jobs_per_page = 20
@@ -151,7 +152,7 @@ class ZipRecruiterScraper(Scraper):
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
comp_currency = job.get("compensation_currency")
return JobPost(
- id=str(job['listing_key']),
+ id=str(job["listing_key"]),
title=title,
company_name=company,
location=location,