JobSpy/examples/JobSpy_LongScrape.py

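"""Long-running scrape example for JobSpy.

Fetches jobs in pages of `results_in_each_iteration`, retrying each page up
to `max_retries` times with an increasing sleep between attempts, then writes
everything to a CSV whose name is chosen so an existing jobs.csv is not
overwritten.
"""
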
from jobspy import scrape_jobs
import pandas as pd
import os
import time
# Pick a new CSV filename if jobs.csv already exists.
csv_filename = "jobs.csv"
counter = 1
while os.path.exists(csv_filename):
    csv_filename = f"jobs_{counter}.csv"
    counter += 1
# results wanted and offset
results_wanted = 1000
offset = 0
all_jobs = []
# max retries
max_retries = 3
# number of results to fetch in each iteration
results_in_each_iteration = 30
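
# Main loop: fetch one page per pass and advance `offset` after each success.
# A failed page is retried up to `max_retries` times, sleeping longer after
# each attempt to back off from rate limiting.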
while len(all_jobs) < results_wanted:
    retry_count = 0
    while retry_count < max_retries:
        print("Scraping jobs", offset, "to", offset + results_in_each_iteration)
        try:
            jobs = scrape_jobs(
                site_name=["indeed"],
                search_term="software engineer",
                # Other locations to try: "New York, NY", "Dallas, TX"
                location="Los Angeles, CA",
                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
                country_indeed="USA",
                offset=offset,
                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
            )
            # Add the scraped jobs to the list
            all_jobs.extend(jobs.to_dict('records'))
            # Advance the offset to the next page of results
            offset += results_in_each_iteration
            print(f"Scraped {len(all_jobs)} jobs")
            # Sleep to avoid rate limiting (adjust the delay as needed)
            print("Sleeping secs", 100 * (retry_count + 1))
            time.sleep(100 * (retry_count + 1))
            break  # Break out of the retry loop if successful
        except Exception as e:
            print(f"Error: {e}")
            retry_count += 1
            # Back off longer after each failed attempt
            print("Sleeping secs before retry", 100 * (retry_count + 1))
            time.sleep(100 * (retry_count + 1))

    if retry_count >= max_retries:
        print("Max retries reached. Exiting.")
        break

# Build a DataFrame from the collected job data
jobs_df = pd.DataFrame(all_jobs)
# Formatting
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", 50)
print(jobs_df)
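
# Optional: overlapping offsets can repeat postings, so drop duplicates before
# saving. This assumes the results include a "job_url" column (present in
# current JobSpy output); the guard keeps this safe if the schema differs.
if "job_url" in jobs_df.columns:
    jobs_df = jobs_df.drop_duplicates(subset=["job_url"]).reset_index(drop=True)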
jobs_df.to_csv(csv_filename, index=False)
print(f"Outputted to {csv_filename}")