LinkedIn scraper fixes (#159):

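- Correct the initial page offset calculation.
- Separate the page (offset) variable from the request counter used for logging.
- Fix the starting value of the job offset so the user-supplied offset is not added a second time.
- Increment the offset by the number of jobs returned instead of the fixed jobs-per-page value.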
This commit is contained in:
adamagassi
2024-05-28 23:38:13 +03:00
committed by GitHub
parent 5cb7ffe5fd
commit 7f6271b2e0

View File

@@ -72,7 +72,8 @@ class LinkedInScraper(Scraper):
job_list: list[JobPost] = [] job_list: list[JobPost] = []
seen_urls = set() seen_urls = set()
url_lock = Lock() url_lock = Lock()
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0 page = scraper_input.offset // 25 * 25 if scraper_input.offset else 0
request_count = 0
seconds_old = ( seconds_old = (
scraper_input.hours_old * 3600 if scraper_input.hours_old else None scraper_input.hours_old * 3600 if scraper_input.hours_old else None
) )
@@ -80,7 +81,8 @@ class LinkedInScraper(Scraper):
lambda: len(job_list) < scraper_input.results_wanted and page < 1000 lambda: len(job_list) < scraper_input.results_wanted and page < 1000
) )
while continue_search(): while continue_search():
logger.info(f"LinkedIn search page: {page // 25 + 1}") request_count += 1
logger.info(f"LinkedIn search page: {request_count}")
params = { params = {
"keywords": scraper_input.search_term, "keywords": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
@@ -92,7 +94,7 @@ class LinkedInScraper(Scraper):
else None else None
), ),
"pageNum": 0, "pageNum": 0,
"start": page + scraper_input.offset, "start": page,
"f_AL": "true" if scraper_input.easy_apply else None, "f_AL": "true" if scraper_input.easy_apply else None,
"f_C": ( "f_C": (
",".join(map(str, scraper_input.linkedin_company_ids)) ",".join(map(str, scraper_input.linkedin_company_ids))
@@ -156,7 +158,7 @@ class LinkedInScraper(Scraper):
if continue_search(): if continue_search():
time.sleep(random.uniform(self.delay, self.delay + self.band_delay)) time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
page += self.jobs_per_page page += len(job_list)
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)