diff --git a/examples/JobSpy_Demo.py b/examples/JobSpy_Demo.py
index c982793..23f3c2f 100644
--- a/examples/JobSpy_Demo.py
+++ b/examples/JobSpy_Demo.py
@@ -1,31 +1,77 @@
 from jobspy import scrape_jobs
 import pandas as pd
+import os
+import time
 
-jobs: pd.DataFrame = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter"],
-    search_term="software engineer",
-    location="Dallas, TX",
-    results_wanted=50, # be wary the higher it is, the more likey you'll get blocked (rotating proxy should work tho)
-    country_indeed="USA",
-    offset=25 # start jobs from an offset (use if search failed and want to continue)
-    # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
-)
+# create a new CSV filename if jobs.csv already exists
+csv_filename = "jobs.csv"
+counter = 1
+while os.path.exists(csv_filename):
+    csv_filename = f"jobs_{counter}.csv"
+    counter += 1
 
-# formatting for pandas
+# total results wanted and starting offset
+results_wanted = 1000
+offset = 0
+
+all_jobs = []
+
+# max retries per batch
+max_retries = 3
+
+# number of results to request in each iteration
+results_in_each_iteration = 50
+
+while len(all_jobs) < results_wanted:
+    retry_count = 0
+    while retry_count < max_retries:
+        print(f"Scraping jobs {offset} to {offset + results_in_each_iteration}")
+        try:
+            jobs = scrape_jobs(
+                site_name=["indeed"],
+                search_term="software engineer",
+                # New York, NY
+                # Dallas, TX
+
+                # Los Angeles, CA
+                location="Los Angeles, CA",
+                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
+                country_indeed="USA",
+                offset=offset,
+                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
+            )
+
+            # add the scraped jobs to the list
+            all_jobs.extend(jobs.to_dict("records"))
+
+            # increment the offset for the next page of results
+            offset += results_in_each_iteration
+
+            # delay to avoid rate limiting (adjust the sleep time as needed)
+            print(f"Scraped {len(all_jobs)} jobs")
+            print(f"Sleeping {100 * (retry_count + 1)} seconds")
+            time.sleep(100 * (retry_count + 1)) # pause between requests; scales with retry count
+
+            break # break out of the retry loop if successful
+        except Exception as e:
+            print(f"Error: {e}")
+            retry_count += 1
+            print(f"Sleeping {100 * (retry_count + 1)} seconds before retry")
+            time.sleep(100 * (retry_count + 1))
+    if retry_count >= max_retries:
+        print("Max retries reached. Exiting.")
+        break
+
+# build a DataFrame from the collected job data
+jobs_df = pd.DataFrame(all_jobs)
+
+# formatting for pandas output
 pd.set_option("display.max_columns", None)
 pd.set_option("display.max_rows", None)
 pd.set_option("display.width", None)
-pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc
+pd.set_option("display.max_colwidth", 50)
 
-# 1: output to console
-print(jobs)
+print(jobs_df)
 
-# 2: output to .csv
-jobs.to_csv("./jobs.csv", index=False)
-print("outputted to jobs.csv")
-
-# 3: output to .xlsx
-# jobs.to_xlsx('jobs.xlsx', index=False)
-
-# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
-# display(jobs)
+jobs_df.to_csv(csv_filename, index=False)
+print(f"Outputted to {csv_filename}")