Compare commits

..

3 Commits

Author SHA1 Message Date
gigaSec
a4f6851c32 Fix GlassDoor Country Vietnam(#122) 2024-03-04 17:35:57 -06:00
troy-conte
db01bc6bbb log search updates, fix glassdoor (#120) 2024-03-04 16:39:38 -06:00
Cullen Watson
f8a4eccc6b Remove pandas warning (#118) 2024-02-29 21:30:56 -06:00
10 changed files with 193 additions and 176 deletions

View File

@@ -104,15 +104,6 @@ JobPost
└── is_remote (bool) └── is_remote (bool)
``` ```
### Exceptions
The following exceptions may be raised when using JobSpy:
* `LinkedInException`
* `IndeedException`
* `ZipRecruiterException`
* `GlassdoorException`
## Supported Countries for Job Searching ## Supported Countries for Job Searching
### **LinkedIn** ### **LinkedIn**
@@ -147,7 +138,7 @@ You can specify the following countries when searching on Indeed (use the exact
| South Korea | Spain* | Sweden | Switzerland* | | South Korea | Spain* | Sweden | Switzerland* |
| Taiwan | Thailand | Turkey | Ukraine | | Taiwan | Thailand | Turkey | Ukraine |
| United Arab Emirates | UK* | USA* | Uruguay | | United Arab Emirates | UK* | USA* | Uruguay |
| Venezuela | Vietnam | | | | Venezuela | Vietnam* | | |
Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search. Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.
@@ -167,8 +158,4 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
- Waiting some time between scrapes (site-dependent). - Waiting some time between scrapes (site-dependent).
- Trying a VPN or proxy to change your IP address. - Trying a VPN or proxy to change your IP address.
--- ---

34
poetry.lock generated
View File

@@ -524,17 +524,6 @@ files = [
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
] ]
[[package]]
name = "html2text"
version = "2020.1.16"
description = "Turn HTML into equivalent Markdown-structured text."
optional = false
python-versions = ">=3.5"
files = [
{file = "html2text-2020.1.16-py3-none-any.whl", hash = "sha256:c7c629882da0cf377d66f073329ccf34a12ed2adf0169b9285ae4e63ef54c82b"},
{file = "html2text-2020.1.16.tar.gz", hash = "sha256:e296318e16b059ddb97f7a8a1d6a5c1d7af4544049a01e261731d2d5cc277bbb"},
]
[[package]] [[package]]
name = "idna" name = "idna"
version = "3.4" version = "3.4"
@@ -1037,6 +1026,21 @@ files = [
{file = "jupyterlab_widgets-3.0.8.tar.gz", hash = "sha256:d428ab97b8d87cc7c54cbf37644d6e0f0e662f23876e05fa460a73ec3257252a"}, {file = "jupyterlab_widgets-3.0.8.tar.gz", hash = "sha256:d428ab97b8d87cc7c54cbf37644d6e0f0e662f23876e05fa460a73ec3257252a"},
] ]
[[package]]
name = "markdownify"
version = "0.11.6"
description = "Convert HTML to markdown."
optional = false
python-versions = "*"
files = [
{file = "markdownify-0.11.6-py3-none-any.whl", hash = "sha256:ba35fe289d5e9073bcd7d2cad629278fe25f1a93741fcdc0bfb4f009076d8324"},
{file = "markdownify-0.11.6.tar.gz", hash = "sha256:009b240e0c9f4c8eaf1d085625dcd4011e12f0f8cec55dedf9ea6f7655e49bfe"},
]
[package.dependencies]
beautifulsoup4 = ">=4.9,<5"
six = ">=1.15,<2"
[[package]] [[package]]
name = "markupsafe" name = "markupsafe"
version = "2.1.3" version = "2.1.3"
@@ -2271,13 +2275,13 @@ test = ["flake8", "isort", "pytest"]
[[package]] [[package]]
name = "tls-client" name = "tls-client"
version = "1.0" version = "1.0.1"
description = "Advanced Python HTTP Client." description = "Advanced Python HTTP Client."
optional = false optional = false
python-versions = "*" python-versions = "*"
files = [ files = [
{file = "tls_client-1.0-py3-none-any.whl", hash = "sha256:f1183f5e18cb31914bd62d11b350a33ea0293ea80fb91d69a3072821dece3e66"}, {file = "tls_client-1.0.1-py3-none-any.whl", hash = "sha256:2f8915c0642c2226c9e33120072a2af082812f6310d32f4ea4da322db7d3bb1c"},
{file = "tls_client-1.0.tar.gz", hash = "sha256:7f6de48ad4a0ef69b72682c76ce604155971e07b4bfb2148a36276194ae3e7a0"}, {file = "tls_client-1.0.1.tar.gz", hash = "sha256:dad797f3412bb713606e0765d489f547ffb580c5ffdb74aed47a183ce8505ff5"},
] ]
[[package]] [[package]]
@@ -2446,4 +2450,4 @@ files = [
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "40cdc19a57cba0d21ff4f0fcfa53e14a073fcccd9f2a871440e056ab6e8fade0" content-hash = "ba7f7cc9b6833a4a6271981f90610395639dd8b9b3db1370cbd1149d70cc9632"

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.45" version = "1.1.47"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"
@@ -13,12 +13,12 @@ packages = [
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.10" python = "^3.10"
requests = "^2.31.0" requests = "^2.31.0"
tls-client = "*"
beautifulsoup4 = "^4.12.2" beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0" pandas = "^2.1.0"
NUMPY = "1.24.2" NUMPY = "1.24.2"
pydantic = "^2.3.0" pydantic = "^2.3.0"
html2text = "^2020.1.16" tls-client = "^1.0.1"
markdownify = "^0.11.6"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]

View File

@@ -152,8 +152,14 @@ def scrape_jobs(
jobs_dfs.append(job_df) jobs_dfs.append(job_df)
if jobs_dfs: if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True) # Step 1: Filter out all-NA columns from each DataFrame before concatenation
desired_order: list[str] = [ filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
# Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
# Desired column order
desired_order = [
"job_url_hyper" if hyperlinks else "job_url", "job_url_hyper" if hyperlinks else "job_url",
"site", "site",
"title", "title",
@@ -172,6 +178,16 @@ def scrape_jobs(
"emails", "emails",
"description", "description",
] ]
return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])
# Step 3: Ensure all desired columns are present, adding missing ones as empty
for column in desired_order:
if column not in jobs_df.columns:
jobs_df[column] = None # Add missing columns as empty
# Reorder the DataFrame according to the desired order
jobs_df = jobs_df[desired_order]
# Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
else: else:
return pd.DataFrame() return pd.DataFrame()

View File

@@ -122,7 +122,7 @@ class Country(Enum):
USA = ("usa,us,united states", "www", "com") USA = ("usa,us,united states", "www", "com")
URUGUAY = ("uruguay", "uy") URUGUAY = ("uruguay", "uy")
VENEZUELA = ("venezuela", "ve") VENEZUELA = ("venezuela", "ve")
VIETNAM = ("vietnam", "vn") VIETNAM = ("vietnam", "vn", "com")
# internal for ziprecruiter # internal for ziprecruiter
US_CANADA = ("usa/ca", "www") US_CANADA = ("usa/ca", "www")
@@ -145,7 +145,7 @@ class Country(Enum):
else: else:
raise Exception(f"Glassdoor is not available for {self.name}") raise Exception(f"Glassdoor is not available for {self.name}")
def get_url(self): def get_glassdoor_url(self):
return f"https://{self.glassdoor_domain_value}/" return f"https://{self.glassdoor_domain_value}/"
@classmethod @classmethod

View File

@@ -5,6 +5,8 @@ jobspy.scrapers.glassdoor
This module contains routines to scrape Glassdoor. This module contains routines to scrape Glassdoor.
""" """
import json import json
import re
import requests import requests
from typing import Optional from typing import Optional
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -42,6 +44,7 @@ class GlassdoorScraper(Scraper):
self.session = None self.session = None
self.scraper_input = None self.scraper_input = None
self.jobs_per_page = 30 self.jobs_per_page = 30
self.max_pages = 30
self.seen_urls = set() self.seen_urls = set()
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
@@ -52,39 +55,40 @@ class GlassdoorScraper(Scraper):
""" """
self.scraper_input = scraper_input self.scraper_input = scraper_input
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted) self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_url() self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(self.proxy, is_tls=True, has_retry=True)
token = self._get_csrf_token()
self.headers['gd-csrf-token'] = token if token else self.fallback_token
location_id, location_type = self._get_location( location_id, location_type = self._get_location(
scraper_input.location, scraper_input.is_remote scraper_input.location, scraper_input.is_remote
) )
if location_type is None: if location_type is None:
logger.error('Glassdoor: location not parsed')
return JobResponse(jobs=[]) return JobResponse(jobs=[])
all_jobs: list[JobPost] = [] all_jobs: list[JobPost] = []
cursor = None cursor = None
max_pages = 30
self.session = create_session(self.proxy, is_tls=False, has_retry=True)
self.session.get(self.base_url)
try: for page in range(
for page in range( 1 + (scraper_input.offset // self.jobs_per_page),
1 + (scraper_input.offset // self.jobs_per_page), min(
min( (scraper_input.results_wanted // self.jobs_per_page) + 2,
(scraper_input.results_wanted // self.jobs_per_page) + 2, self.max_pages + 1,
max_pages + 1, ),
), ):
): logger.info(f'Glassdoor search page: {page}')
try: try:
jobs, cursor = self._fetch_jobs_page( jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor scraper_input, location_id, location_type, page, cursor
) )
all_jobs.extend(jobs) all_jobs.extend(jobs)
if len(all_jobs) >= scraper_input.results_wanted: if not jobs or len(all_jobs) >= scraper_input.results_wanted:
all_jobs = all_jobs[: scraper_input.results_wanted] all_jobs = all_jobs[: scraper_input.results_wanted]
break break
except Exception as e: except Exception as e:
raise GlassdoorException(str(e)) logger.error(f'Glassdoor: {str(e)}')
except Exception as e: break
raise GlassdoorException(str(e))
return JobResponse(jobs=all_jobs) return JobResponse(jobs=all_jobs)
def _fetch_jobs_page( def _fetch_jobs_page(
@@ -98,27 +102,26 @@ class GlassdoorScraper(Scraper):
""" """
Scrapes a page of Glassdoor for jobs with scraper_input criteria Scrapes a page of Glassdoor for jobs with scraper_input criteria
""" """
jobs = []
self.scraper_input = scraper_input self.scraper_input = scraper_input
try: try:
payload = self._add_payload( payload = self._add_payload(
location_id, location_type, page_num, cursor location_id, location_type, page_num, cursor
) )
response = self.session.post( response = self.session.post(
f"{self.base_url}/graph", headers=self.headers, timeout=10, data=payload f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload
) )
if response.status_code != 200: if response.status_code != 200:
raise GlassdoorException( raise GlassdoorException(f"bad response status code: {response.status_code}")
f"bad response status code: {response.status_code}"
)
res_json = response.json()[0] res_json = response.json()[0]
if "errors" in res_json: if "errors" in res_json:
raise ValueError("Error encountered in API response") raise ValueError("Error encountered in API response")
except Exception as e: except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e:
raise GlassdoorException(str(e)) logger.error(f'Glassdoor: {str(e)}')
return jobs, None
jobs_data = res_json["data"]["jobListings"]["jobListings"] jobs_data = res_json["data"]["jobListings"]["jobListings"]
jobs = []
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data} future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
for future in as_completed(future_to_job_data): for future in as_completed(future_to_job_data):
@@ -133,6 +136,18 @@ class GlassdoorScraper(Scraper):
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
) )
def _get_csrf_token(self):
"""
Fetches csrf token needed for API by visiting a generic page
"""
res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers)
pattern = r'"token":\s*"([^"]+)"'
matches = re.findall(pattern, res.text)
token = None
if matches:
token = matches[0]
return token
def _process_job(self, job_data): def _process_job(self, job_data):
""" """
Processes a single job and fetches its description. Processes a single job and fetches its description.
@@ -217,7 +232,7 @@ class GlassdoorScraper(Scraper):
return "11047", "STATE" # remote options return "11047", "STATE" # remote options
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
session = create_session(self.proxy, has_retry=True) session = create_session(self.proxy, has_retry=True)
res = session.get(url) res = self.session.get(url, headers=self.headers)
if res.status_code != 200: if res.status_code != 200:
if res.status_code == 429: if res.status_code == 429:
logger.error(f'429 Response - Blocked by Glassdoor for too many requests') logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
@@ -266,7 +281,74 @@ class GlassdoorScraper(Scraper):
"fromage": fromage, "fromage": fromage,
"sort": "date" "sort": "date"
}, },
"query": """ "query": self.query_template
}
if self.scraper_input.job_type:
payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
)
return json.dumps([payload])
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
@staticmethod
def parse_location(location_name: str) -> Location | None:
if not location_name or location_name == "Remote":
return
city, _, state = location_name.partition(", ")
return Location(city=city, state=state)
@staticmethod
def get_cursor_for_page(pagination_cursors, page_num):
for cursor_data in pagination_cursors:
if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"]
fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok"
headers = {
"authority": "www.glassdoor.com",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9",
"apollographql-client-name": "job-search-next",
"apollographql-client-version": "4.65.5",
"content-type": "application/json",
"origin": "https://www.glassdoor.com",
"referer": "https://www.glassdoor.com/",
"sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"macOS"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
}
query_template = """
query JobSearchResultsQuery( query JobSearchResultsQuery(
$excludeJobListingIds: [Long!], $excludeJobListingIds: [Long!],
$keyword: String, $keyword: String,
@@ -431,70 +513,4 @@ class GlassdoorScraper(Scraper):
} }
__typename __typename
} }
""" """
}
if self.scraper_input.job_type:
payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
)
return json.dumps([payload])
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
@staticmethod
def parse_location(location_name: str) -> Location | None:
if not location_name or location_name == "Remote":
return
city, _, state = location_name.partition(", ")
return Location(city=city, state=state)
@staticmethod
def get_cursor_for_page(pagination_cursors, page_num):
for cursor_data in pagination_cursors:
if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"]
headers = {
"authority": "www.glassdoor.com",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9",
"apollographql-client-name": "job-search-next",
"apollographql-client-version": "4.65.5",
"content-type": "application/json",
"gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok",
"origin": "https://www.glassdoor.com",
"referer": "https://www.glassdoor.com/",
"sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"macOS"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
}

View File

@@ -15,7 +15,6 @@ from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException
from ..utils import ( from ..utils import (
count_urgent_words, count_urgent_words,
extract_emails_from_text, extract_emails_from_text,
@@ -63,8 +62,7 @@ class IndeedScraper(Scraper):
while len(self.seen_urls) < scraper_input.results_wanted: while len(self.seen_urls) < scraper_input.results_wanted:
pages_to_process = math.ceil((scraper_input.results_wanted - len(self.seen_urls)) / self.jobs_per_page) pages_to_process = math.ceil((scraper_input.results_wanted - len(self.seen_urls)) / self.jobs_per_page)
new_jobs = False new_jobs = False
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [ futures: list[Future] = [
executor.submit(self._scrape_page, page + pages_processed) executor.submit(self._scrape_page, page + pages_processed)
for page in range(pages_to_process) for page in range(pages_to_process)
@@ -82,7 +80,6 @@ class IndeedScraper(Scraper):
if not new_jobs: if not new_jobs:
break break
if len(self.seen_urls) > scraper_input.results_wanted: if len(self.seen_urls) > scraper_input.results_wanted:
job_list = job_list[:scraper_input.results_wanted] job_list = job_list[:scraper_input.results_wanted]
@@ -94,10 +91,11 @@ class IndeedScraper(Scraper):
:param page: :param page:
:return: jobs found on page, total number of jobs found for search :return: jobs found on page, total number of jobs found for search
""" """
logger.info(f'Indeed search page: {page + 1}')
job_list = [] job_list = []
domain = self.scraper_input.country.indeed_domain_value domain = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com" self.base_url = f"https://{domain}.indeed.com"
try: try:
session = create_session(self.proxy) session = create_session(self.proxy)
response = session.get( response = session.get(
@@ -124,12 +122,15 @@ class IndeedScraper(Scraper):
return job_list return job_list
jobs = IndeedScraper._parse_jobs(soup) jobs = IndeedScraper._parse_jobs(soup)
if not jobs:
return []
if ( if (
not jobs.get("metaData", {}) not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {}) .get("mosaicProviderJobCardsModel", {})
.get("results") .get("results")
): ):
raise IndeedException("No jobs found.") logger.error("Indeed - No jobs found.")
return []
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"] jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
job_keys = [job['jobkey'] for job in jobs] job_keys = [job['jobkey'] for job in jobs]
@@ -139,7 +140,6 @@ class IndeedScraper(Scraper):
job_results: list[Future] = [ job_results: list[Future] = [
executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed) executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
] ]
job_list = [result.result() for result in job_results if result.result()] job_list = [result.result() for result in job_results if result.result()]
return job_list return job_list
@@ -302,11 +302,11 @@ class IndeedScraper(Scraper):
jobs = json.loads(m.group(1).strip()) jobs = json.loads(m.group(1).strip())
return jobs return jobs
else: else:
raise IndeedException("Could not find mosaic provider job cards data") logger.warning(f'Indeed: Could not find mosaic provider job cards data')
return {}
else: else:
raise IndeedException( logger.warning(f"Indeed: Could not parse any jobs on the page")
"Could not find any results for the search" return {}
)
@staticmethod @staticmethod
def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool: def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:

View File

@@ -9,8 +9,6 @@ import random
from typing import Optional from typing import Optional
from datetime import datetime from datetime import datetime
import requests
from requests.exceptions import ProxyError
from threading import Lock from threading import Lock
from bs4.element import Tag from bs4.element import Tag
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -41,15 +39,16 @@ from ..utils import (
class LinkedInScraper(Scraper): class LinkedInScraper(Scraper):
base_url = "https://www.linkedin.com" base_url = "https://www.linkedin.com"
delay = 3 delay = 3
band_delay = 4
jobs_per_page = 25
def __init__(self, proxy: Optional[str] = None): def __init__(self, proxy: Optional[str] = None):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
""" """
super().__init__(Site(Site.LINKEDIN), proxy=proxy)
self.scraper_input = None self.scraper_input = None
site = Site(Site.LINKEDIN)
self.country = "worldwide" self.country = "worldwide"
super().__init__(site, proxy=proxy)
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
""" """
@@ -68,8 +67,8 @@ class LinkedInScraper(Scraper):
else None else None
) )
continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000 continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
while continue_search(): while continue_search():
logger.info(f'LinkedIn search page: {page // 25 + 1}')
session = create_session(is_tls=False, has_retry=True, delay=5) session = create_session(is_tls=False, has_retry=True, delay=5)
params = { params = {
"keywords": scraper_input.search_term, "keywords": scraper_input.search_term,
@@ -83,8 +82,9 @@ class LinkedInScraper(Scraper):
"start": page + scraper_input.offset, "start": page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None, "f_AL": "true" if scraper_input.easy_apply else None,
"f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None, "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None,
"f_TPR": f"r{seconds_old}",
} }
if seconds_old is not None:
params["f_TPR"] = f"r{seconds_old}"
params = {k: v for k, v in params.items() if v is not None} params = {k: v for k, v in params.items() if v is not None}
try: try:
@@ -101,13 +101,13 @@ class LinkedInScraper(Scraper):
logger.error(f'429 Response - Blocked by LinkedIn for too many requests') logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
else: else:
logger.error(f'LinkedIn response status code {response.status_code}') logger.error(f'LinkedIn response status code {response.status_code}')
return JobResponse(job_list=job_list) return JobResponse(jobs=job_list)
except Exception as e: except Exception as e:
if "Proxy responded with" in str(e): if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy') logger.error(f'LinkedIn: Bad proxy')
else: else:
logger.error(f'Indeed: {str(e)}') logger.error(f'LinkedIn: {str(e)}')
return JobResponse(job_list=job_list) return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
job_cards = soup.find_all("div", class_="base-search-card") job_cards = soup.find_all("div", class_="base-search-card")
@@ -136,8 +136,8 @@ class LinkedInScraper(Scraper):
raise LinkedInException(str(e)) raise LinkedInException(str(e))
if continue_search(): if continue_search():
time.sleep(random.uniform(self.delay, self.delay + 2)) time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
page += 25 page += self.jobs_per_page
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)

View File

@@ -1,21 +1,19 @@
import re
import logging import logging
import numpy as np import re
import html2text import numpy as np
import tls_client
import requests import requests
import tls_client
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry from requests.adapters import HTTPAdapter, Retry
from ..jobs import JobType from ..jobs import JobType
text_maker = html2text.HTML2Text()
logger = logging.getLogger("JobSpy") logger = logging.getLogger("JobSpy")
logger.propagate = False logger.propagate = False
if not logger.handlers: if not logger.handlers:
logger.setLevel(logging.ERROR) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter) console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
@@ -37,13 +35,9 @@ def count_urgent_words(description: str) -> int:
def markdown_converter(description_html: str): def markdown_converter(description_html: str):
if description_html is None: if description_html is None:
return "" return None
text_maker.ignore_links = False markdown = md(description_html)
try: return markdown.strip()
markdown = text_maker.handle(description_html)
return markdown.strip()
except AssertionError as e:
return ""
def extract_emails_from_text(text: str) -> list[str] | None: def extract_emails_from_text(text: str) -> list[str] | None:

View File

@@ -63,7 +63,7 @@ class ZipRecruiterScraper(Scraper):
break break
if page > 1: if page > 1:
time.sleep(self.delay) time.sleep(self.delay)
logger.info(f'ZipRecruiter search page: {page}')
jobs_on_page, continue_token = self._find_jobs_in_page( jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token scraper_input, continue_token
) )