"""Scrape Indeed job postings in concurrent segments with JobSpy and save them to CSV."""

import concurrent.futures
import time

from jobspy import scrape_jobs
import pandas as pd


def scrape_and_append_threaded(offset, results_in_segment, all_jobs):
    """Scrape one segment of results and append its rows to ``all_jobs``.

    Intended to run as a worker task: it reports progress with ``print`` and
    handles its own failures so that one bad segment does not abort the run.

    Args:
        offset: Value passed to ``scrape_jobs`` as ``offset`` (where this
            segment starts in the overall result set).
        results_in_segment: Value passed to ``scrape_jobs`` as
            ``results_wanted`` for this segment.
        all_jobs: Shared list that is extended in place with one dict per
            scraped job.

    Returns:
        None. Results are delivered through ``all_jobs``.
    """
    try:
        print(f"Scraping jobs from offset {offset}")
        jobs = scrape_jobs(
            site_name=["indeed"],
            search_term="software engineer",
            location="Los Angeles, CA",
            results_wanted=results_in_segment,
            country_indeed="USA",
            offset=offset,
        )
        # The scrape result must support to_dict("records") and len()
        # (presumably a pandas DataFrame -- confirm against JobSpy docs).
        all_jobs.extend(jobs.to_dict("records"))
        print(f"Scraped {len(jobs)} jobs from offset {offset}")
    except Exception as e:
        # Best-effort: report the failed segment and let the others continue.
        print(f"Error at offset {offset}: {e}")


def main(
    results_wanted=500,
    results_in_each_iteration=100,
    csv_filename="jobs.csv",
    submit_delay=2,
):
    """Scrape jobs in concurrent segments, print them, and write them to CSV.

    Args:
        results_wanted: Total number of results to request across all segments.
        results_in_each_iteration: Number of results requested per segment.
            The final segment is smaller when ``results_wanted`` is not an
            exact multiple of this value.
        csv_filename: Path of the CSV file to write.
        submit_delay: Seconds to wait between submitting consecutive segments,
            so the requests do not all start at the same instant.

    Raises:
        ValueError: If ``results_in_each_iteration`` is not positive.
    """
    if results_in_each_iteration <= 0:
        raise ValueError("results_in_each_iteration must be positive")

    all_jobs = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        segment_offsets = range(0, results_wanted, results_in_each_iteration)
        for index, offset in enumerate(segment_offsets):
            # delay to avoid rate limiting: applied between submissions, since
            # sleeping after the tasks are already queued throttles nothing.
            if index:
                time.sleep(submit_delay)
            # Shrink the last segment so a non-multiple total is not truncated.
            segment_size = min(results_in_each_iteration, results_wanted - offset)
            futures.append(
                executor.submit(
                    scrape_and_append_threaded, offset, segment_size, all_jobs
                )
            )

        # The worker handles its own Exceptions, so this only reports
        # anything that escaped that handler.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error:
                print(f"Error: {error}")

    # DataFrame from the collected job data. Workers extend the shared list as
    # they finish, so rows are in completion order rather than offset order.
    jobs_df = pd.DataFrame(all_jobs)

    # Formatting: show every column and row when printing the frame.
    pd.set_option("display.max_columns", None)
    pd.set_option("display.max_rows", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", 50)

    print(jobs_df)

    # Output to CSV
    jobs_df.to_csv(csv_filename, index=False)
    print(f"Outputted to {csv_filename}")


if __name__ == "__main__":
    main()