mirror of https://github.com/Bunsly/JobSpy
added new example
parent
1ffdb1756f
commit
0c7e7841e2
|
@ -0,0 +1,62 @@
|
||||||
|
import concurrent.futures
|
||||||
|
import time
|
||||||
|
from jobspy import scrape_jobs
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def scrape_and_append_threaded(offset, results_in_segment, all_jobs):
|
||||||
|
try:
|
||||||
|
print(f"Scraping jobs from offset {offset}")
|
||||||
|
jobs = scrape_jobs(
|
||||||
|
site_name=["indeed"],
|
||||||
|
search_term="software engineer",
|
||||||
|
location="Los Angeles, CA",
|
||||||
|
results_wanted=results_in_segment,
|
||||||
|
country_indeed="USA",
|
||||||
|
offset=offset,
|
||||||
|
)
|
||||||
|
|
||||||
|
all_jobs.extend(jobs.to_dict("records"))
|
||||||
|
print(f"Scraped {len(jobs)} jobs from offset {offset}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error at offset {offset}: {e}")
|
||||||
|
|
||||||
|
def main():
|
||||||
|
results_wanted = 500
|
||||||
|
results_in_each_iteration = 100
|
||||||
|
offset = 0
|
||||||
|
all_jobs = []
|
||||||
|
segments = results_wanted//results_in_each_iteration
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
|
futures = []
|
||||||
|
for i in range(segments):
|
||||||
|
offset = i * results_in_each_iteration
|
||||||
|
futures.append(executor.submit(scrape_and_append_threaded, offset, results_in_each_iteration, all_jobs))
|
||||||
|
|
||||||
|
for future in concurrent.futures.as_completed(futures):
|
||||||
|
error = future.exception()
|
||||||
|
if error:
|
||||||
|
print(f"Error: {error}")
|
||||||
|
|
||||||
|
# delay to avoid rate limiting
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
# DataFrame from the collected job data
|
||||||
|
jobs_df = pd.DataFrame(all_jobs)
|
||||||
|
|
||||||
|
# Formatting
|
||||||
|
pd.set_option("display.max_columns", None)
|
||||||
|
pd.set_option("display.max_rows", None)
|
||||||
|
pd.set_option("display.width", None)
|
||||||
|
pd.set_option("display.max_colwidth", 50)
|
||||||
|
|
||||||
|
print(jobs_df)
|
||||||
|
|
||||||
|
# Output to CSV
|
||||||
|
csv_filename = "jobs.csv"
|
||||||
|
jobs_df.to_csv(csv_filename, index=False)
|
||||||
|
print(f"Outputted to {csv_filename}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
Loading…
Reference in New Issue