enh: ziprecruiter full description (#162 )

docs: readme
2026-03-04 19:44:30 -08:00 · 2024-06-09 16:21:01 -05:00 · 2024-05-29 19:32:32 -05:00 · 2024-05-28 16:04:26 -05:00 · 2024-05-28 16:01:29 -05:00 · 2024-05-28 15:39:24 -05:00
7 changed files with 83 additions and 28 deletions
--- a/README.md
+++ b/README.md
@@ -13,9 +13,6 @@ work with us.*
 - Aggregates the job postings in a Pandas DataFrame
 - Proxies support

-[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
-Updated for release v1.1.3
-
 ![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)

 ### Installation
@@ -41,12 +38,12 @@ jobs = scrape_jobs(
    country_indeed='USA',  # only needed for indeed / glassdoor
    
    # linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
-    # proxies=["Efb5EA8OIk0BQb:wifi;us;@proxy.soax.com:9000", "localhost"],
+    # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
    
 )
 print(f"Found {len(jobs)} jobs")
 print(jobs.head())
-jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
+jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
 ```

 ### Output
@@ -79,7 +76,7 @@ Optional
 ├── job_type (str): 
 |    fulltime, parttime, internship, contract
 │
-├── proxies (): 
+├── proxies (list): 
 |    in format ['user:pass@host:port', 'localhost']
 |    each job board will round robin through the proxies
 │
@@ -143,13 +140,14 @@ JobPost
 │   ├── state (str)
 ├── description (str)
 ├── job_type (str): fulltime, parttime, internship, contract
+├── job_function (str)
 ├── compensation (object)
 │   ├── interval (str): yearly, monthly, weekly, daily, hourly
 │   ├── min_amount (int)
 │   ├── max_amount (int)
 │   └── currency (enum)
-└── date_posted (date)
-└── emails (str)
+├── date_posted (date)
+├── emails (str)
 └── is_remote (bool)

 Indeed specific
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.54"
+version = "1.1.56"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
--- a/src/jobspy/init.py
+++ b/src/jobspy/init.py
@@ -182,6 +182,7 @@ def scrape_jobs(
            "max_amount",
            "currency",
            "is_remote",
+            "job_function",
            "emails",
            "description",
            "company_url",
--- a/src/jobspy/jobs/init.py
+++ b/src/jobspy/jobs/init.py
@@ -254,6 +254,9 @@ class JobPost(BaseModel):
    logo_photo_url: str | None = None
    banner_photo_url: str | None = None

+    # linkedin only atm
+    job_function: str | None = None
+

 class JobResponse(BaseModel):
    jobs: list[JobPost] = []
--- a/src/jobspy/scrapers/linkedin/init.py
+++ b/src/jobspy/scrapers/linkedin/init.py
@@ -13,14 +13,13 @@ import regex as re
 from typing import Optional
 from datetime import datetime

-from threading import Lock
 from bs4.element import Tag
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse, unquote

 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
-from ..utils import create_session
+from ..utils import create_session, remove_attributes
 from ...jobs import (
    JobPost,
    Location,
@@ -71,8 +70,8 @@ class LinkedInScraper(Scraper):
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_urls = set()
-        url_lock = Lock()
-        page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
+        page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
+        request_count = 0
        seconds_old = (
            scraper_input.hours_old * 3600 if scraper_input.hours_old else None
        )
@@ -80,7 +79,8 @@ class LinkedInScraper(Scraper):
            lambda: len(job_list) < scraper_input.results_wanted and page < 1000
        )
        while continue_search():
-            logger.info(f"LinkedIn search page: {page // 25 + 1}")
+            request_count += 1
+            logger.info(f"LinkedIn search page: {request_count}")
            params = {
                "keywords": scraper_input.search_term,
                "location": scraper_input.location,
@@ -92,7 +92,7 @@ class LinkedInScraper(Scraper):
                    else None
                ),
                "pageNum": 0,
-                "start": page + scraper_input.offset,
+                "start": page,
                "f_AL": "true" if scraper_input.easy_apply else None,
                "f_C": (
                    ",".join(map(str, scraper_input.linkedin_company_ids))
@@ -140,10 +140,9 @@ class LinkedInScraper(Scraper):
                    job_id = href.split("-")[-1]
                    job_url = f"{self.base_url}/jobs/view/{job_id}"

-                with url_lock:
-                    if job_url in seen_urls:
-                        continue
-                    seen_urls.add(job_url)
+                if job_url in seen_urls:
+                    continue
+                seen_urls.add(job_url)
                try:
                    fetch_desc = scraper_input.linkedin_fetch_description
                    job_post = self._process_job(job_card, job_url, fetch_desc)
@@ -156,7 +155,7 @@ class LinkedInScraper(Scraper):

            if continue_search():
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
-                page += self.jobs_per_page
+                page += len(job_list)

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)
@@ -225,6 +224,7 @@ class LinkedInScraper(Scraper):
            job_url_direct=job_details.get("job_url_direct"),
            emails=extract_emails_from_text(job_details.get("description")),
            logo_photo_url=job_details.get("logo_photo_url"),
+            job_function=job_details.get("job_function"),
        )

    def _get_id(self, url: str):
@@ -248,7 +248,7 @@ class LinkedInScraper(Scraper):
            response.raise_for_status()
        except:
            return {}
-        if response.url == "https://www.linkedin.com/signup":
+        if "linkedin.com/signup" in response.url:
            return {}

        soup = BeautifulSoup(response.text, "html.parser")
@@ -257,16 +257,22 @@ class LinkedInScraper(Scraper):
        )
        description = None
        if div_content is not None:
-
-            def remove_attributes(tag):
-                for attr in list(tag.attrs):
-                    del tag[attr]
-                return tag
-
            div_content = remove_attributes(div_content)
            description = div_content.prettify(formatter="html")
            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                description = markdown_converter(description)
+
+        h3_tag = soup.find(
+            "h3", text=lambda text: text and "Job function" in text.strip()
+        )
+
+        job_function = None
+        if h3_tag:
+            job_function_span = h3_tag.find_next(
+                "span", class_="description__job-criteria-text"
+            )
+            if job_function_span:
+                job_function = job_function_span.text.strip()
        return {
            "description": description,
            "job_type": self._parse_job_type(soup),
@@ -274,6 +280,7 @@ class LinkedInScraper(Scraper):
            "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
                "data-delayed-url"
            ),
+            "job_function": job_function,
        }

    def _get_location(self, metadata_card: Optional[Tag]) -> Location:
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -93,6 +93,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
            else:
                self.proxies = {}
        response = tls_client.Session.execute_request(self, *args, **kwargs)
+        response.ok = response.status_code in range(200, 400)
        return response


@@ -178,3 +179,9 @@ def currency_parser(cur_str):
        num = float(cur_str)

    return np.round(num, 2)
+
+
+def remove_attributes(tag):
+    for attr in list(tag.attrs):
+        del tag[attr]
+    return tag
--- a/src/jobspy/scrapers/ziprecruiter/init.py
+++ b/src/jobspy/scrapers/ziprecruiter/init.py
@@ -7,19 +7,24 @@ This module contains routines to scrape ZipRecruiter.

 from __future__ import annotations

+import json
 import math
+import re
 import time
 from datetime import datetime
 from typing import Optional, Tuple, Any

 from concurrent.futures import ThreadPoolExecutor

+from bs4 import BeautifulSoup
+
 from .. import Scraper, ScraperInput, Site
 from ..utils import (
    logger,
    extract_emails_from_text,
    create_session,
    markdown_converter,
+    remove_attributes,
 )
 from ...jobs import (
    JobPost,
@@ -151,6 +156,8 @@ class ZipRecruiterScraper(Scraper):
        comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
        comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
        comp_currency = job.get("compensation_currency")
+        description_full, job_url_direct = self._get_descr(job_url)
+
        return JobPost(
            id=str(job["listing_key"]),
            title=title,
@@ -165,10 +172,42 @@ class ZipRecruiterScraper(Scraper):
            ),
            date_posted=date_posted,
            job_url=job_url,
-            description=description,
+            description=description_full if description_full else description,
            emails=extract_emails_from_text(description) if description else None,
+            job_url_direct=job_url_direct,
        )

+    def _get_descr(self, job_url):
+        res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
+        description_full = job_url_direct = None
+        if res.ok:
+            soup = BeautifulSoup(res.text, "html.parser")
+            job_descr_div = soup.find("div", class_="job_description")
+            company_descr_section = soup.find("section", class_="company_description")
+            job_description_clean = (
+                remove_attributes(job_descr_div).prettify(formatter="html")
+                if job_descr_div
+                else ""
+            )
+            company_description_clean = (
+                remove_attributes(company_descr_section).prettify(formatter="html")
+                if company_descr_section
+                else ""
+            )
+            description_full = job_description_clean + company_description_clean
+            script_tag = soup.find("script", type="application/json")
+            if script_tag:
+                job_json = json.loads(script_tag.string)
+                job_url_val = job_json["model"]["saveJobURL"]
+                m = re.search(r"job_url=(.+)", job_url_val)
+                if m:
+                    job_url_direct = m.group(1)
+
+            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+                description_full = markdown_converter(description_full)
+
+        return description_full, job_url_direct
+
    def _get_cookies(self):
        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
        url = f"{self.api_url}/jobs-app/event"
Author	SHA1	Message	Date
Cullen Watson	ccb0c17660	enh: ziprecruiter full description (#162 )	2024-06-09 16:21:01 -05:00
Cullen Watson	df339610fa	docs: readme	2024-05-29 19:32:32 -05:00
Cullen Watson	c501006bd8	docs: readme	2024-05-28 16:04:26 -05:00
Cullen Watson	89a3ee231c	enh(li): job function (#160 )	2024-05-28 16:01:29 -05:00
Cullen	6439f71433	chore: version	2024-05-28 15:39:24 -05:00
adamagassi	7f6271b2e0	LinkedIn scraper fixes (#159): correct initial page offset calculation; separate page variable from request counter; fix job offset starting value; increment offset by number of jobs returned instead of expected value	2024-05-28 15:38:13 -05:00