added new example

pull/146/head
Braden Hechmer 2024-05-03 18:22:53 -04:00
parent 1ffdb1756f
commit 0c7e7841e2
1 changed file with 62 additions and 0 deletions

@ -0,0 +1,62 @@
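# Example: scrape a large Indeed result set in parallel by splitting the
# search into fixed-size segments and fetching each segment's offset range
# on its own thread.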
import concurrent.futures
import time
from jobspy import scrape_jobs
import pandas as pd


def scrape_and_append_threaded(offset, results_in_segment, all_jobs):
    """Scrape one segment of results and append its rows to the shared list."""
    try:
        print(f"Scraping jobs from offset {offset}")
        jobs = scrape_jobs(
            site_name=["indeed"],
            search_term="software engineer",
            location="Los Angeles, CA",
            results_wanted=results_in_segment,
            country_indeed="USA",
            offset=offset,
        )
        # list.extend is atomic under CPython's GIL, so worker threads can
        # safely append their rows to the shared list without a lock
        all_jobs.extend(jobs.to_dict("records"))
        print(f"Scraped {len(jobs)} jobs from offset {offset}")
    except Exception as e:
        print(f"Error at offset {offset}: {e}")


def main():
    results_wanted = 500
    results_in_each_iteration = 100
    all_jobs = []
    segments = results_wanted // results_in_each_iteration
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for i in range(segments):
            offset = i * results_in_each_iteration
            futures.append(
                executor.submit(
                    scrape_and_append_threaded,
                    offset,
                    results_in_each_iteration,
                    all_jobs,
                )
            )
            # stagger submissions so the requests don't all fire at once,
            # which helps avoid rate limiting
            time.sleep(2)

        # surface any errors the workers didn't handle themselves
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error:
                print(f"Error: {error}")

    # DataFrame from the collected job data
    jobs_df = pd.DataFrame(all_jobs)

    # Formatting
    pd.set_option("display.max_columns", None)
    pd.set_option("display.max_rows", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", 50)
    print(jobs_df)

    # Output to CSV
    csv_filename = "jobs.csv"
    jobs_df.to_csv(csv_filename, index=False)
    print(f"Output written to {csv_filename}")


if __name__ == "__main__":
    main()