added search + exceptions

Still cannot get the timeout exception to work properly:

  File "/home/guest/Desktop/Job Crawler/venv/lib/python3.10/site-packages/jobspy/scrapers/glassdoor/__init__.py", line 99, in scrape
    raise GlassdoorException(str(e))
jobspy.scrapers.exceptions.GlassdoorException: HTTPSConnectionPool(host='www.glassdoor.com', port=443): Read timed out. (read timeout=10)
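
A note on the traceback above: requests.exceptions.ReadTimeout does not inherit from the builtin TimeoutError (its bases are requests' Timeout -> RequestException -> IOError), so an "except TimeoutError" clause never matches it; the blanket "except Exception" catches the timeout first and re-raises it as GlassdoorException, which is exactly the wrapping the traceback shows. A minimal standalone sketch of the mismatch (the URL and timeout value are placeholders, not from the codebase):

import requests

try:
    # Deliberately tiny timeout to force a requests-level timeout
    requests.get("https://www.glassdoor.com", timeout=0.001)
except TimeoutError:
    # Never reached: requests timeouts do not subclass the builtin TimeoutError
    print("builtin TimeoutError handler")
except requests.exceptions.Timeout as e:
    # ReadTimeout and ConnectTimeout both subclass requests.exceptions.Timeout
    print(f"requests timeout caught: {e}")

Catching requests.exceptions.ReadTimeout directly, as this commit does, is what allows the per-page skip logic to run.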
pull/120/head
troy-conte 2024-03-01 17:12:53 -05:00 committed by GitHub
parent 2845cc9865
commit e99ea38a5f
1 changed file with 16 additions and 7 deletions

jobspy/scrapers/glassdoor/__init__.py

@@ -28,7 +28,6 @@ from ...jobs import (
     DescriptionFormat
 )
 
-
 class GlassdoorScraper(Scraper):
     def __init__(self, proxy: Optional[str] = None):
         """
@@ -62,10 +61,13 @@ class GlassdoorScraper(Scraper):
         all_jobs: list[JobPost] = []
         cursor = None
         max_pages = 30
+        urlCount = 0
 
         self.session = create_session(self.proxy, is_tls=False, has_retry=True)
         self.session.get(self.base_url)
         try:
+            print(f'Glassdoor searches: {urlCount}')
+            urlCount += 1
             for page in range(
                 1 + (scraper_input.offset // self.jobs_per_page),
                 min(
@@ -81,12 +83,20 @@ class GlassdoorScraper(Scraper):
                     if len(all_jobs) >= scraper_input.results_wanted:
                         all_jobs = all_jobs[: scraper_input.results_wanted]
                         break
-                except TimeoutError as timeout_exception:  # Specific exception for timeouts
-                    print(f"Timeout occurred on page {page}: {str(timeout_exception)}")
+                except requests.exceptions.ReadTimeout as timeout_exception:
+                    logger.error(f"Timeout occurred on page {page}: {str(timeout_exception)}")
                     # Skip this page and continue to the next
                     continue
                 except Exception as e:
                     raise GlassdoorException(str(e))
+        except requests.exceptions.ReadTimeout as timeout_exception:
+            logger.error(f"Timeout occurred on page {page}: {str(timeout_exception)}")
+        except requests.exceptions.HTTPError as http_error:
+            if http_error.response.status_code == 502:
+                logger.error(f"Bad Gateway (502) encountered: {str(http_error)}")
+                # Decide on a retry mechanism, log the error, or take another appropriate action
+            else:
+                raise  # Re-raise the exception if it's not a 502 error
         except Exception as e:
             raise GlassdoorException(str(e))
         return JobResponse(jobs=all_jobs)
@@ -103,10 +113,9 @@ class GlassdoorScraper(Scraper):
         Scrapes a page of Glassdoor for jobs with scraper_input criteria
         """
         self.scraper_input = scraper_input
-        urlCount = 0
         try:
-            logger.error(f'Glassdoor searches: {urlCount}')
-            urlCount+=1
             payload = self._add_payload(
                 location_id, location_type, page_num, cursor
             )
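
One caveat the diff above leaves open (an observation, not part of the commit): the inner "except Exception as e: raise GlassdoorException(str(e))" runs before control reaches the outer clauses, so an HTTPError raised inside the page loop arrives at the outer level already re-wrapped as GlassdoorException, and the outer requests.exceptions.HTTPError / 502 branch can never match it. A standalone sketch of that masking effect (ScraperError and fetch are illustrative stand-ins, not names from the codebase):

import requests

class ScraperError(Exception):
    # Stand-in for GlassdoorException
    pass

def fetch():
    # Simulate a 502 surfacing from inside the page loop
    response = requests.Response()
    response.status_code = 502
    raise requests.exceptions.HTTPError("502 Bad Gateway", response=response)

try:
    try:
        fetch()
    except Exception as e:
        raise ScraperError(str(e))  # the original HTTPError type is lost here
except requests.exceptions.HTTPError:
    print("outer HTTPError handler")  # never reached
except ScraperError as e:
    print(f"only the wrapper arrives: {e}")

Adding something like "except requests.exceptions.RequestException: raise" ahead of the inner "except Exception" would let requests' own exception types propagate, so the outer ReadTimeout and HTTPError handlers could see the original errors.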