Salary parse (#163 )

enh: ziprecruiter full description (#162 )
docs: readme
2026-03-05 12:04:33 -08:00 · 2024-06-09 17:45:38 -05:00 · 2024-06-09 16:21:01 -05:00 · 2024-05-29 19:32:32 -05:00 · 2024-05-28 16:04:26 -05:00 · 2024-05-28 16:01:29 -05:00
9 changed files with 191 additions and 67 deletions
--- a/README.md
+++ b/README.md
@@ -13,9 +13,6 @@ work with us.*
 - Aggregates the job postings in a Pandas DataFrame
 - Proxies support
 [Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
 Updated for release v1.1.3
 ![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
 ### Installation
@@ -41,12 +38,12 @@ jobs = scrape_jobs(
    country_indeed='USA',  # only needed for indeed / glassdoor
    # linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
-    # proxies=["Efb5EA8OIk0BQb:wifi;us;@proxy.soax.com:9000", "localhost"],
+    # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
 )
 print(f"Found {len(jobs)} jobs")
 print(jobs.head())
-jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
+jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
 ```
 ### Output
@@ -79,7 +76,7 @@ Optional
 ├── job_type (str): 
 |    fulltime, parttime, internship, contract
 │
-├── proxies (): 
+├── proxies (list): 
 |    in format ['user:pass@host:port', 'localhost']
 |    each job board will round robin through the proxies
 │
@@ -143,13 +140,14 @@ JobPost
 │   ├── state (str)
 ├── description (str)
 ├── job_type (str): fulltime, parttime, internship, contract
 ├── job_function (str)
 ├── compensation (object)
 │   ├── interval (str): yearly, monthly, weekly, daily, hourly
 │   ├── min_amount (int)
 │   ├── max_amount (int)
 │   └── currency (enum)
-└── date_posted (date)
+├── date_posted (date)
-└── emails (str)
+├── emails (str)
 └── is_remote (bool)
 Indeed specific
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.54"
+version = "1.1.57"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
--- a/src/jobspy/init.py
+++ b/src/jobspy/init.py
@@ -5,7 +5,7 @@ from typing import Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from .jobs import JobType, Location
-from .scrapers.utils import logger, set_logger_level
+from .scrapers.utils import logger, set_logger_level, extract_salary
 from .scrapers.indeed import IndeedScraper
 from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
@@ -118,6 +118,21 @@ def scrape_jobs(
            site_value, scraped_data = future.result()
            site_to_jobs_dict[site_value] = scraped_data
    def convert_to_annual(job_data: dict):
        """
        Rescales a job's pay range to a yearly figure, in place.

        :param job_data: dict holding "interval", "min_amount" and "max_amount";
            the amounts are multiplied by the number of pay periods per year
            for the current interval, then "interval" is set to "yearly"
        :return: None (job_data is mutated)
        """
        # pay periods per year: 40h * 52w, 12 months, 52 weeks, 5d * 52w
        periods_per_year = {
            "hourly": 2080,
            "monthly": 12,
            "weekly": 52,
            "daily": 260,
        }
        factor = periods_per_year.get(job_data["interval"])
        if factor is not None:
            for amount_key in ("min_amount", "max_amount"):
                job_data[amount_key] *= factor
        # any other interval is only relabelled, amounts stay untouched
        job_data["interval"] = "yearly"
    jobs_dfs: list[pd.DataFrame] = []
    for site, job_response in site_to_jobs_dict.items():
@@ -150,11 +165,22 @@ def scrape_jobs(
                job_data["min_amount"] = compensation_obj.get("min_amount")
                job_data["max_amount"] = compensation_obj.get("max_amount")
                job_data["currency"] = compensation_obj.get("currency", "USD")
                if (
                    job_data["interval"]
                    and job_data["interval"] != "yearly"
                    and job_data["min_amount"]
                    and job_data["max_amount"]
                ):
                    convert_to_annual(job_data)
            else:
-                job_data["interval"] = None
+                if country_enum == Country.USA:
-                job_data["min_amount"] = None
+                    (
-                job_data["max_amount"] = None
+                        job_data["interval"],
-                job_data["currency"] = None
+                        job_data["min_amount"],
                        job_data["max_amount"],
                        job_data["currency"],
                    ) = extract_salary(job_data["description"])
            job_df = pd.DataFrame([job_data])
            jobs_dfs.append(job_df)
@@ -182,6 +208,7 @@ def scrape_jobs(
            "max_amount",
            "currency",
            "is_remote",
            "job_function",
            "emails",
            "description",
            "company_url",
--- a/src/jobspy/jobs/init.py
+++ b/src/jobspy/jobs/init.py
@@ -254,6 +254,9 @@ class JobPost(BaseModel):
    logo_photo_url: str | None = None
    banner_photo_url: str | None = None
    # linkedin only atm
    job_function: str | None = None
 class JobResponse(BaseModel):
    jobs: list[JobPost] = []
--- a/src/jobspy/scrapers/glassdoor/init.py
+++ b/src/jobspy/scrapers/glassdoor/init.py
@@ -69,7 +69,7 @@ class GlassdoorScraper(Scraper):
        if location_type is None:
            logger.error("Glassdoor: location not parsed")
            return JobResponse(jobs=[])
-        all_jobs: list[JobPost] = []
+        job_list: list[JobPost] = []
        cursor = None
        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
@@ -81,14 +81,14 @@ class GlassdoorScraper(Scraper):
                jobs, cursor = self._fetch_jobs_page(
                    scraper_input, location_id, location_type, page, cursor
                )
-                all_jobs.extend(jobs)
+                job_list.extend(jobs)
-                if not jobs or len(all_jobs) >= scraper_input.results_wanted:
+                if not jobs or len(job_list) >= scraper_input.results_wanted:
-                    all_jobs = all_jobs[: scraper_input.results_wanted]
+                    job_list = job_list[: scraper_input.results_wanted]
                    break
            except Exception as e:
                logger.error(f"Glassdoor: {str(e)}")
                break
-        return JobResponse(jobs=all_jobs)
+        return JobResponse(jobs=job_list)
    def _fetch_jobs_page(
        self,
--- a/src/jobspy/scrapers/indeed/init.py
+++ b/src/jobspy/scrapers/indeed/init.py
@@ -297,8 +297,8 @@ class IndeedScraper(Scraper):
        max_range = comp["range"].get("max")
        return Compensation(
            interval=interval,
-            min_amount=round(min_range, 2) if min_range is not None else None,
+            min_amount=int(min_range) if min_range is not None else None,
-            max_amount=round(max_range, 2) if max_range is not None else None,
+            max_amount=int(max_range) if max_range is not None else None,
            currency=job["compensation"]["currencyCode"],
        )
--- a/src/jobspy/scrapers/linkedin/init.py
+++ b/src/jobspy/scrapers/linkedin/init.py
@@ -13,14 +13,13 @@ import regex as re
 from typing import Optional
 from datetime import datetime
 from threading import Lock
 from bs4.element import Tag
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse, unquote
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
-from ..utils import create_session
+from ..utils import create_session, remove_attributes
 from ...jobs import (
    JobPost,
    Location,
@@ -70,9 +69,9 @@ class LinkedInScraper(Scraper):
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
-        seen_urls = set()
+        seen_ids = set()
-        url_lock = Lock()
+        page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
-        page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
+        request_count = 0
        seconds_old = (
            scraper_input.hours_old * 3600 if scraper_input.hours_old else None
        )
@@ -80,7 +79,8 @@ class LinkedInScraper(Scraper):
            lambda: len(job_list) < scraper_input.results_wanted and page < 1000
        )
        while continue_search():
-            logger.info(f"LinkedIn search page: {page // 25 + 1}")
+            request_count += 1
            logger.info(f"LinkedIn search page: {request_count}")
            params = {
                "keywords": scraper_input.search_term,
                "location": scraper_input.location,
@@ -92,7 +92,7 @@ class LinkedInScraper(Scraper):
                    else None
                ),
                "pageNum": 0,
-                "start": page + scraper_input.offset,
+                "start": page,
                "f_AL": "true" if scraper_input.easy_apply else None,
                "f_C": (
                    ",".join(map(str, scraper_input.linkedin_company_ids))
@@ -133,36 +133,34 @@ class LinkedInScraper(Scraper):
                return JobResponse(jobs=job_list)
            for job_card in job_cards:
                job_url = None
                href_tag = job_card.find("a", class_="base-card__full-link")
                if href_tag and "href" in href_tag.attrs:
                    href = href_tag.attrs["href"].split("?")[0]
                    job_id = href.split("-")[-1]
                    job_url = f"{self.base_url}/jobs/view/{job_id}"
-                with url_lock:
+                    if job_id in seen_ids:
                    if job_url in seen_urls:
                        continue
-                    seen_urls.add(job_url)
+                    seen_ids.add(job_id)
-                try:
+
-                    fetch_desc = scraper_input.linkedin_fetch_description
+                    try:
-                    job_post = self._process_job(job_card, job_url, fetch_desc)
+                        fetch_desc = scraper_input.linkedin_fetch_description
-                    if job_post:
+                        job_post = self._process_job(job_card, job_id, fetch_desc)
-                        job_list.append(job_post)
+                        if job_post:
-                    if not continue_search():
+                            job_list.append(job_post)
-                        break
+                        if not continue_search():
-                except Exception as e:
+                            break
-                    raise LinkedInException(str(e))
+                    except Exception as e:
                        raise LinkedInException(str(e))
            if continue_search():
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
-                page += self.jobs_per_page
+                page += len(job_list)
        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)
    def _process_job(
-        self, job_card: Tag, job_url: str, full_descr: bool
+        self, job_card: Tag, job_id: str, full_descr: bool
    ) -> Optional[JobPost]:
        salary_tag = job_card.find("span", class_="job-search-card__salary-info")
@@ -209,46 +207,39 @@ class LinkedInScraper(Scraper):
                date_posted = None
        job_details = {}
        if full_descr:
-            job_details = self._get_job_details(job_url)
+            job_details = self._get_job_details(job_id)
        return JobPost(
-            id=self._get_id(job_url),
+            id=job_id,
            title=title,
            company_name=company,
            company_url=company_url,
            location=location,
            date_posted=date_posted,
-            job_url=job_url,
+            job_url=f"{self.base_url}/jobs/view/{job_id}",
            compensation=compensation,
            job_type=job_details.get("job_type"),
            description=job_details.get("description"),
            job_url_direct=job_details.get("job_url_direct"),
            emails=extract_emails_from_text(job_details.get("description")),
            logo_photo_url=job_details.get("logo_photo_url"),
            job_function=job_details.get("job_function"),
        )
-    def _get_id(self, url: str):
+    def _get_job_details(self, job_id: str) -> dict:
        """
        Extracts the job id from the job url
        :param url:
        :return: str
        """
        if not url:
            return None
        return url.split("/")[-1]
    def _get_job_details(self, job_page_url: str) -> dict:
        """
        Retrieves job description and other job details by going to the job page url
        :param job_page_url:
        :return: dict
        """
        try:
-            response = self.session.get(job_page_url, timeout=5)
+            response = self.session.get(
                f"{self.base_url}/jobs-guest/jobs/api/jobPosting/{job_id}", timeout=5
            )
            response.raise_for_status()
        except:
            return {}
-        if response.url == "https://www.linkedin.com/signup":
+        if "linkedin.com/signup" in response.url:
            return {}
        soup = BeautifulSoup(response.text, "html.parser")
@@ -257,16 +248,22 @@ class LinkedInScraper(Scraper):
        )
        description = None
        if div_content is not None:
            def remove_attributes(tag):
                for attr in list(tag.attrs):
                    del tag[attr]
                return tag
            div_content = remove_attributes(div_content)
            description = div_content.prettify(formatter="html")
            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                description = markdown_converter(description)
        h3_tag = soup.find(
            "h3", text=lambda text: text and "Job function" in text.strip()
        )
        job_function = None
        if h3_tag:
            job_function_span = h3_tag.find_next(
                "span", class_="description__job-criteria-text"
            )
            if job_function_span:
                job_function = job_function_span.text.strip()
        return {
            "description": description,
            "job_type": self._parse_job_type(soup),
@@ -274,6 +271,7 @@ class LinkedInScraper(Scraper):
            "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
                "data-delayed-url"
            ),
            "job_function": job_function,
        }
    def _get_location(self, metadata_card: Optional[Tag]) -> Location:
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -93,6 +93,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
            else:
                self.proxies = {}
        response = tls_client.Session.execute_request(self, *args, **kwargs)
        response.ok = response.status_code in range(200, 400)
        return response
@@ -178,3 +179,61 @@ def currency_parser(cur_str):
        num = float(cur_str)
    return np.round(num, 2)
 def remove_attributes(tag):
    """
    Deletes every attribute currently listed in ``tag.attrs`` from the tag.

    :param tag: object exposing an ``attrs`` mapping and item deletion
    :return: the same tag object, for call chaining
    """
    # snapshot the names first: deleting while iterating attrs would fail
    attribute_names = tuple(tag.attrs)
    for name in attribute_names:
        del tag[name]
    return tag
 def extract_salary(
    salary_str,
    lower_limit=1000,
    upper_limit=700000,
    hourly_threshold=350,
    monthly_threshold=30000,
 ):
    """
    Finds the first "$min - $max" range in free text and returns it as a
    yearly salary.

    :param salary_str: text to search (e.g. a job description); may be None
    :param lower_limit: smallest annual amount accepted for either bound
    :param upper_limit: largest annual amount accepted for either bound
    :param hourly_threshold: a minimum below this is treated as an hourly wage
    :param monthly_threshold: a minimum below this (but not below
        hourly_threshold) is treated as a monthly wage
    :return: tuple (interval, min_amount, max_amount, currency) which is
        ("yearly", int, int, "USD") on success, or four Nones when the text is
        empty, no range is found, or the range fails the limit checks
    """
    if not salary_str:
        return None, None, None, None

    min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"

    def to_number(s):
        # keep the fractional part: truncating here would turn "$85.5k" into
        # 85,000 and annualize "$25.50" an hour as $25 an hour
        return float(s.replace(",", ""))

    def convert_hourly_to_annual(hourly_wage):
        return hourly_wage * 2080

    def convert_monthly_to_annual(monthly_wage):
        return monthly_wage * 12

    match = re.search(min_max_pattern, salary_str)

    if match:
        min_salary = to_number(match.group(1))
        max_salary = to_number(match.group(3))

        # A 'k' suffix on either bound scales BOTH bounds, so that a range
        # written as "$100 - 120k" is read as 100,000 - 120,000
        if "k" in match.group(2).lower() or "k" in match.group(4).lower():
            min_salary *= 1000
            max_salary *= 1000

        # The minimum decides the pay period; the maximum is only converted
        # when it falls under the same threshold
        if min_salary < hourly_threshold:
            min_salary = convert_hourly_to_annual(min_salary)
            if max_salary < hourly_threshold:
                max_salary = convert_hourly_to_annual(max_salary)

        elif min_salary < monthly_threshold:
            min_salary = convert_monthly_to_annual(min_salary)
            if max_salary < monthly_threshold:
                max_salary = convert_monthly_to_annual(max_salary)

        # Amounts are reported as whole currency units; round only after all
        # scaling so no precision is lost beforehand
        min_salary = int(round(min_salary))
        max_salary = int(round(max_salary))

        # Ensure salary range is within specified limits
        if (
            lower_limit <= min_salary <= upper_limit
            and lower_limit <= max_salary <= upper_limit
            and min_salary < max_salary
        ):
            return "yearly", min_salary, max_salary, "USD"

    return None, None, None, None
--- a/src/jobspy/scrapers/ziprecruiter/init.py
+++ b/src/jobspy/scrapers/ziprecruiter/init.py
@@ -7,19 +7,24 @@ This module contains routines to scrape ZipRecruiter.
 from __future__ import annotations
 import json
 import math
 import re
 import time
 from datetime import datetime
 from typing import Optional, Tuple, Any
 from concurrent.futures import ThreadPoolExecutor
 from bs4 import BeautifulSoup
 from .. import Scraper, ScraperInput, Site
 from ..utils import (
    logger,
    extract_emails_from_text,
    create_session,
    markdown_converter,
    remove_attributes,
 )
 from ...jobs import (
    JobPost,
@@ -151,6 +156,8 @@ class ZipRecruiterScraper(Scraper):
        comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
        comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
        comp_currency = job.get("compensation_currency")
        description_full, job_url_direct = self._get_descr(job_url)
        return JobPost(
            id=str(job["listing_key"]),
            title=title,
@@ -165,10 +172,42 @@ class ZipRecruiterScraper(Scraper):
            ),
            date_posted=date_posted,
            job_url=job_url,
-            description=description,
+            description=description_full if description_full else description,
            emails=extract_emails_from_text(description) if description else None,
            job_url_direct=job_url_direct,
        )
    def _get_descr(self, job_url):
        res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
        description_full = job_url_direct = None
        if res.ok:
            soup = BeautifulSoup(res.text, "html.parser")
            job_descr_div = soup.find("div", class_="job_description")
            company_descr_section = soup.find("section", class_="company_description")
            job_description_clean = (
                remove_attributes(job_descr_div).prettify(formatter="html")
                if job_descr_div
                else ""
            )
            company_description_clean = (
                remove_attributes(company_descr_section).prettify(formatter="html")
                if company_descr_section
                else ""
            )
            description_full = job_description_clean + company_description_clean
            script_tag = soup.find("script", type="application/json")
            if script_tag:
                job_json = json.loads(script_tag.string)
                job_url_val = job_json["model"]["saveJobURL"]
                m = re.search(r"job_url=(.+)", job_url_val)
                if m:
                    job_url_direct = m.group(1)
            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                description_full = markdown_converter(description_full)
        return description_full, job_url_direct
    def _get_cookies(self):
        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
        url = f"{self.api_url}/jobs-app/event"
Author	SHA1	Message	Date
Cullen Watson	d000a81eb3	Salary parse (#163 )	2024-06-09 17:45:38 -05:00
Cullen Watson	ccb0c17660	enh: ziprecruiter full description (#162 )	2024-06-09 16:21:01 -05:00
Cullen Watson	df339610fa	docs: readme	2024-05-29 19:32:32 -05:00
Cullen Watson	c501006bd8	docs: readme	2024-05-28 16:04:26 -05:00
Cullen Watson	89a3ee231c	enh(li): job function (#160 )	2024-05-28 16:01:29 -05:00
Cullen	6439f71433	chore: version	2024-05-28 15:39:24 -05:00
adamagassi	7f6271b2e0	LinkedIn scraper fixes: (#159 ) Correct initial page offset calculation Separate page variable from request counter Fix job offset starting value Increment offset by number of jobs returned instead of expected value	2024-05-28 15:38:13 -05:00