diff --git a/examples/JobSpy_LongScrape_Threaded.py b/examples/JobSpy_LongScrape_Threaded.py
new file mode 100644
index 0000000..59bade7
--- /dev/null
+++ b/examples/JobSpy_LongScrape_Threaded.py
@@ -0,0 +1,67 @@
+import concurrent.futures
+import time
+from jobspy import scrape_jobs
+import pandas as pd
+
+
+def scrape_and_append_threaded(offset, results_in_segment, all_jobs):
+    """Scrape one segment of results starting at `offset` and append it to `all_jobs`."""
+    try:
+        print(f"Scraping jobs from offset {offset}")
+        jobs = scrape_jobs(
+            site_name=["indeed"],
+            search_term="software engineer",
+            location="Los Angeles, CA",
+            results_wanted=results_in_segment,
+            country_indeed="USA",
+            offset=offset,
+        )
+
+        # list.extend is atomic under CPython's GIL, so no lock is needed here
+        all_jobs.extend(jobs.to_dict("records"))
+        print(f"Scraped {len(jobs)} jobs from offset {offset}")
+
+    except Exception as e:
+        print(f"Error at offset {offset}: {e}")
+
+
+def main():
+    results_wanted = 500
+    results_in_each_iteration = 100
+    all_jobs = []
+    segments = results_wanted // results_in_each_iteration
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(segments):
+            offset = i * results_in_each_iteration
+            futures.append(
+                executor.submit(scrape_and_append_threaded, offset, results_in_each_iteration, all_jobs)
+            )
+            # stagger the request start times to reduce the chance of rate limiting
+            time.sleep(2)
+
+        for future in concurrent.futures.as_completed(futures):
+            error = future.exception()
+            if error:
+                print(f"Error: {error}")
+
+    # DataFrame from the collected job data
+    jobs_df = pd.DataFrame(all_jobs)
+
+    # Formatting
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.max_rows", None)
+    pd.set_option("display.width", None)
+    pd.set_option("display.max_colwidth", 50)
+
+    print(jobs_df)
+
+    # Output to CSV
+    csv_filename = "jobs.csv"
+    jobs_df.to_csv(csv_filename, index=False)
+    print(f"Wrote {len(jobs_df)} jobs to {csv_filename}")
+
+
+if __name__ == "__main__":
+    main()