add offset param & email extraction (#51)

* add offset param
* [enh]: extract emails
2026-03-06 20:44:30 -08:00 · 2023-09-28 18:11:28 -05:00
parent 286b9e1256
commit af07c1ecbd
17 changed files with 1209 additions and 1126 deletions
--- a/src/jobspy/init.py
+++ b/src/jobspy/init.py
@@ -1,8 +1,7 @@
 import pandas as pd
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Tuple, NamedTuple, Dict, Optional
-import traceback
+from typing import List, Tuple, Optional

 from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
@@ -27,17 +26,18 @@ def _map_str_to_site(site_name: str) -> Site:


 def scrape_jobs(
-    site_name: str | List[str] | Site | List[Site],
-    search_term: str,
-    location: str = "",
-    distance: int = None,
-    is_remote: bool = False,
-    job_type: str = None,
-    easy_apply: bool = False,  # linkedin
-    results_wanted: int = 15,
-    country_indeed: str = "usa",
-    hyperlinks: bool = False,
-    proxy: Optional[str] = None,
+        site_name: str | List[str] | Site | List[Site],
+        search_term: str,
+        location: str = "",
+        distance: int = None,
+        is_remote: bool = False,
+        job_type: str = None,
+        easy_apply: bool = False,  # linkedin
+        results_wanted: int = 15,
+        country_indeed: str = "usa",
+        hyperlinks: bool = False,
+        proxy: Optional[str] = None,
+        offset: Optional[int] = 0
 ) -> pd.DataFrame:
    """
    Simultaneously scrapes job data from multiple job sites.
@@ -49,8 +49,8 @@ def scrape_jobs(
            if value_str in job_type.value:
                return job_type
        raise Exception(f"Invalid job type: {value_str}")
-    job_type = get_enum_from_value(job_type) if job_type else None

+    job_type = get_enum_from_value(job_type) if job_type else None

    if type(site_name) == str:
        site_type = [_map_str_to_site(site_name)]
@@ -72,6 +72,7 @@ def scrape_jobs(
        job_type=job_type,
        easy_apply=easy_apply,
        results_wanted=results_wanted,
+        offset=offset
    )

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
@@ -149,17 +150,19 @@ def scrape_jobs(
    if jobs_dfs:
        jobs_df = pd.concat(jobs_dfs, ignore_index=True)
        desired_order: List[str] = [
+            "job_url_hyper" if hyperlinks else "job_url",
            "site",
            "title",
            "company",
            "location",
-            "date_posted",
            "job_type",
+            "date_posted",
            "interval",
+            "benefits",
            "min_amount",
            "max_amount",
            "currency",
-            "job_url_hyper" if hyperlinks else "job_url",
+            "emails",
            "description",
        ]
        jobs_formatted_df = jobs_df[desired_order]