From 89a5264391d9f47781112bd942c347b3062a4358 Mon Sep 17 00:00:00 2001
From: Harish Vadaparty
Date: Fri, 12 Jan 2024 23:54:00 +0530
Subject: [PATCH] add long scrape example (#81)

---
 .../{JobSpy_Demo.py => JobSpy_AllSites.py}   |  7 +-
 examples/JobSpy_LongScrape.py                | 77 +++++++++++++++++++
 2 files changed, 80 insertions(+), 4 deletions(-)
 rename examples/{JobSpy_Demo.py => JobSpy_AllSites.py} (72%)
 create mode 100644 examples/JobSpy_LongScrape.py

diff --git a/examples/JobSpy_Demo.py b/examples/JobSpy_AllSites.py
similarity index 72%
rename from examples/JobSpy_Demo.py
rename to examples/JobSpy_AllSites.py
index c982793..22deea0 100644
--- a/examples/JobSpy_Demo.py
+++ b/examples/JobSpy_AllSites.py
@@ -2,12 +2,11 @@ from jobspy import scrape_jobs
 import pandas as pd
 
 jobs: pd.DataFrame = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
     search_term="software engineer",
     location="Dallas, TX",
-    results_wanted=50,  # be wary: the higher it is, the more likely you'll get blocked (rotating proxy should work tho)
+    results_wanted=25,  # be wary: the higher it is, the more likely you'll get blocked (rotating proxy can help tho)
     country_indeed="USA",
-    offset=25  # start jobs from an offset (use if search failed and want to continue)
     # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
 )
 
@@ -28,4 +27,4 @@ print("outputted to jobs.csv")
 # jobs.to_xlsx('jobs.xlsx', index=False)
 
 # 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
-# display(jobs)
+# display(jobs)
\ No newline at end of file
diff --git a/examples/JobSpy_LongScrape.py b/examples/JobSpy_LongScrape.py
new file mode 100644
index 0000000..189ca81
--- /dev/null
+++ b/examples/JobSpy_LongScrape.py
@@ -0,0 +1,77 @@
+from jobspy import scrape_jobs
+import pandas as pd
+import os
+import time
+
+# create a new csv filename if jobs.csv already exists
+csv_filename = "jobs.csv"
+counter = 1
+while os.path.exists(csv_filename):
+    csv_filename = f"jobs_{counter}.csv"
+    counter += 1
+
+# results wanted and offset
+results_wanted = 1000
+offset = 0
+
+all_jobs = []
+
+# max retries
+max_retries = 3
+
+# number of results at each iteration
+results_in_each_iteration = 30
+
+while len(all_jobs) < results_wanted:
+    retry_count = 0
+    while retry_count < max_retries:
+        print("Scraping jobs from offset", offset, "to", offset + results_in_each_iteration)
+        try:
+            jobs = scrape_jobs(
+                site_name=["indeed"],
+                search_term="software engineer",
+                # New York, NY
+                # Dallas, TX
+
+                # Los Angeles, CA
+                location="Los Angeles, CA",
+                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
+                country_indeed="USA",
+                offset=offset,
+                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
+            )
+
+            # Add the scraped jobs to the list
+            all_jobs.extend(jobs.to_dict('records'))
+
+            # Increment the offset for the next page of results
+            offset += results_in_each_iteration
+
+            # Add a delay to avoid rate limiting (you can adjust the delay time as needed)
+            print(f"Scraped {len(all_jobs)} jobs")
+            print("Sleeping secs", 100 * (retry_count + 1))
+            time.sleep(100 * (retry_count + 1))  # sleep between requests (100s, longer after retries)
+
+            break  # Break out of the retry loop if successful
+        except Exception as e:
+            print(f"Error: {e}")
+            retry_count += 1
+            print("Sleeping secs before retry", 100 * (retry_count + 1))
+            time.sleep(100 * (retry_count + 1))
+            if retry_count >= max_retries:
+                print("Max retries reached. Exiting.")
Exiting.") + break + +# DataFrame from the collected job data +jobs_df = pd.DataFrame(all_jobs) + +# Formatting +pd.set_option("display.max_columns", None) +pd.set_option("display.max_rows", None) +pd.set_option("display.width", None) +pd.set_option("display.max_colwidth", 50) + +print(jobs_df) + +jobs_df.to_csv(csv_filename, index=False) +print(f"Outputted to {csv_filename}")