From a2c8fe046e03ec4e854dc1fc070bfb09a5d2125d Mon Sep 17 00:00:00 2001
From: Zachary Hampton <zachary@zacharysproducts.com>
Date: Mon, 6 Nov 2023 22:13:19 -0700
Subject: [PATCH 1/4] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4238921..73e60af 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 **Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
 
-*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to
+*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
 work with us.*  
 
 Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** – a Python package

From cc9e7866b7db273d527809b98f691bd46f6beb03 Mon Sep 17 00:00:00 2001
From: Cullen Watson <cullen@bunsly.com>
Date: Wed, 8 Nov 2023 15:51:07 -0600
Subject: [PATCH 2/4] fix linkedin bug & add linkedin company url (#67)

---
 README.md                                | 10 ++--
 pyproject.toml                           |  2 +-
 src/jobspy/__init__.py                   |  1 +
 src/jobspy/jobs/__init__.py              |  2 +
 src/jobspy/scrapers/linkedin/__init__.py | 65 +++++++++++++-----------
 5 files changed, 46 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 73e60af..6de1dc6 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ zip_recruiter Software Developer                 TEKsystems        Phoenix
 
 ```plaintext
 Required
-├── site_type (List[enum]): linkedin, zip_recruiter, indeed
+├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
 └── search_term (str)
 Optional
 ├── location (int)
@@ -107,21 +107,22 @@ The following exceptions may be raised when using JobSpy:
 * `LinkedInException`
 * `IndeedException`
 * `ZipRecruiterException`
+* `GlassdoorException`
 
 ## Supported Countries for Job Searching
 
 ### **LinkedIn**
 
-LinkedIn searches globally & uses only the `location` parameter.
+LinkedIn searches globally & uses only the `location` parameter. You can fetch at most 1000 jobs from the LinkedIn endpoint we're using.
 
 ### **ZipRecruiter**
 
 ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
 
-### **Indeed**
+### **Indeed / Glassdoor**
 
 Indeed & Glassdoor supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
-parameter to narrow down the location, e.g. city & state if necessary.
+parameter to narrow down the location, e.g. city & state if necessary. 
 
 You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor):
 
@@ -145,6 +146,7 @@ You can specify the following countries when searching on Indeed (use the exact
 | Venezuela            | Vietnam      |            |                |
 
 
+Only 900 jobs can be fetched per search from the Glassdoor endpoint we're using.
 ## Frequently Asked Questions
 
 ---
diff --git a/pyproject.toml b/pyproject.toml
index e5c4a14..ed7c8f9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.25"
+version = "1.1.26"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 7c5fa64..8214a1f 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -163,6 +163,7 @@ def scrape_jobs(
             "site",
             "title",
             "company",
+            "company_url",
             "location",
             "job_type",
             "date_posted",
diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index 0737824..f1fd708 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -196,6 +196,8 @@ class JobPost(BaseModel):
     location: Optional[Location]
 
     description: str | None = None
+    company_url: str | None = None
+
     job_type: list[JobType] | None = None
     compensation: Compensation | None = None
     date_posted: date | None = None
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 26d4390..922e671 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -10,10 +10,10 @@ from datetime import datetime
 import requests
 import time
 from requests.exceptions import ProxyError
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from threading import Lock
+from urllib.parse import urlparse, urlunparse
 
 from .. import Scraper, ScraperInput, Site
 from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
@@ -66,12 +66,10 @@ class LinkedInScraper(Scraper):
                 if scraper_input.job_type
                 else None,
                 "pageNum": 0,
-                page: page + scraper_input.offset,
+                "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
             }
 
-            params = {k: v for k, v in params.items() if v is not None}
-
             params = {k: v for k, v in params.items() if v is not None}
             retries = 0
             while retries < self.MAX_RETRIES:
@@ -88,7 +86,7 @@ class LinkedInScraper(Scraper):
                     break
                 except requests.HTTPError as e:
                     if hasattr(e, "response") and e.response is not None:
-                        if e.response.status_code == 429:
+                        if e.response.status_code in (429, 502):
                             time.sleep(self.DELAY)
                             retries += 1
                             continue
@@ -110,32 +108,27 @@ class LinkedInScraper(Scraper):
 
             soup = BeautifulSoup(response.text, "html.parser")
 
-            with ThreadPoolExecutor(max_workers=5) as executor:
-                futures = []
-                for job_card in soup.find_all("div", class_="base-search-card"):
-                    job_url = None
-                    href_tag = job_card.find("a", class_="base-card__full-link")
-                    if href_tag and "href" in href_tag.attrs:
-                        href = href_tag.attrs["href"].split("?")[0]
-                        job_id = href.split("-")[-1]
-                        job_url = f"{self.url}/jobs/view/{job_id}"
+            for job_card in soup.find_all("div", class_="base-search-card"):
+                job_url = None
+                href_tag = job_card.find("a", class_="base-card__full-link")
+                if href_tag and "href" in href_tag.attrs:
+                    href = href_tag.attrs["href"].split("?")[0]
+                    job_id = href.split("-")[-1]
+                    job_url = f"{self.url}/jobs/view/{job_id}"
 
-                    with url_lock:
-                        if job_url in seen_urls:
-                            continue
-                        seen_urls.add(job_url)
+                with url_lock:
+                    if job_url in seen_urls:
+                        continue
+                    seen_urls.add(job_url)
 
-                    futures.append(executor.submit(self.process_job, job_card, job_url))
+                # Call process_job directly without threading
+                try:
+                    job_post = self.process_job(job_card, job_url)
+                    if job_post:
+                        job_list.append(job_post)
+                except Exception as e:
+                    raise LinkedInException("Exception occurred while processing jobs")
 
-                for future in as_completed(futures):
-                    try:
-                        job_post = future.result()
-                        if job_post:
-                            job_list.append(job_post)
-                    except Exception as e:
-                        raise LinkedInException(
-                            "Exception occurred while processing jobs"
-                        )
             page += 25
 
         job_list = job_list[: scraper_input.results_wanted]
@@ -147,6 +140,11 @@ class LinkedInScraper(Scraper):
 
         company_tag = job_card.find("h4", class_="base-search-card__subtitle")
         company_a_tag = company_tag.find("a") if company_tag else None
+        company_url = (
+            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
+            if company_a_tag and company_a_tag.has_attr("href")
+            else ""
+        )
         company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
 
         metadata_card = job_card.find("div", class_="base-search-card__metadata")
@@ -168,11 +166,13 @@ class LinkedInScraper(Scraper):
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
 
         description, job_type = self.get_job_description(job_url)
+        # description, job_type = None, []
 
         return JobPost(
             title=title,
             description=description,
             company_name=company,
+            company_url=company_url,
             location=location,
             date_posted=date_posted,
             job_url=job_url,
@@ -193,8 +193,15 @@ class LinkedInScraper(Scraper):
         try:
             response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
             response.raise_for_status()
+        except requests.HTTPError as e:
+            if hasattr(e, "response") and e.response is not None:
+                if e.response.status_code in (429, 502):
+                    time.sleep(self.DELAY)
+            return None, None
         except Exception as e:
             return None, None
+        if response.url == "https://www.linkedin.com/signup":
+            return None, None
 
         soup = BeautifulSoup(response.text, "html.parser")
         div_content = soup.find(
@@ -230,7 +237,7 @@ class LinkedInScraper(Scraper):
                     employment_type = employment_type.lower()
                     employment_type = employment_type.replace("-", "")
 
-            return [get_enum_from_job_type(employment_type)]
+            return [get_enum_from_job_type(employment_type)] if employment_type else []
 
         return description, get_job_type(soup)
 

From 81f70ff8a577c2da1eb53a13361420146f30df04 Mon Sep 17 00:00:00 2001
From: Faraz Khan <frzk410@gmail.com>
Date: Fri, 10 Nov 2023 01:57:15 +0500
Subject: [PATCH 3/4] added salary data for linkedin (#68)

---
 pyproject.toml                           |  2 +-
 src/jobspy/scrapers/linkedin/__init__.py | 21 +++++++++++++++++++--
 src/jobspy/scrapers/utils.py             | 17 +++++++++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ed7c8f9..12a694d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.26"
+version = "1.1.27"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 922e671..5fcc696 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -16,9 +16,9 @@ from threading import Lock
 from urllib.parse import urlparse, urlunparse
 
 from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 from ..exceptions import LinkedInException
-from ...jobs import JobPost, Location, JobResponse, JobType, Country
+from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
 
 
 class LinkedInScraper(Scraper):
@@ -135,6 +135,22 @@ class LinkedInScraper(Scraper):
         return JobResponse(jobs=job_list)
 
     def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+        salary_tag = job_card.find('span', class_='job-search-card__salary-info')
+
+        compensation = None
+        if salary_tag:
+            salary_text = salary_tag.get_text(separator=' ').strip()
+            salary_values = [currency_parser(value) for value in salary_text.split('-')]
+            salary_min = salary_values[0]
+            salary_max = salary_values[1]
+            currency = salary_text[0] if salary_text[0] != '$' else 'USD'
+
+            compensation = Compensation(
+                min_amount=int(salary_min),
+                max_amount=int(salary_max),
+                currency=currency,
+            )
+
         title_tag = job_card.find("span", class_="sr-only")
         title = title_tag.get_text(strip=True) if title_tag else "N/A"
 
@@ -177,6 +193,7 @@ class LinkedInScraper(Scraper):
             date_posted=date_posted,
             job_url=job_url,
             job_type=job_type,
+            compensation=compensation,
             benefits=benefits,
             emails=extract_emails_from_text(description) if description else None,
             num_urgent_words=count_urgent_words(description) if description else None,
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 5e5ffb0..c44b875 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -1,4 +1,5 @@
 import re
+import numpy as np
 
 import requests
 import tls_client
@@ -62,3 +63,19 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
         if job_type_str in job_type.value:
             res = job_type
     return res
+
+def currency_parser(cur_str):
+    # Parse a currency string into a number rounded to 2 decimal places.
+    # First drop every character except digits, ',', '.' and '-' (e.g. "$", "EUR", spaces).
+    cur_str = re.sub("[^-0-9.,]", '', cur_str)
+    # Strip ',' and '.' from all but the last 3 characters; the tail may still hold a decimal separator.
+    cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
+
+    if '.' in list(cur_str[-3:]):
+        num = float(cur_str)
+    elif ',' in list(cur_str[-3:]):
+        num = float(cur_str.replace(',', '.'))  # tail uses ',' as the decimal mark
+    else:
+        num = float(cur_str)
+
+    return np.round(num, 2)

From dfb8c18c518496009fb6210e7e526e5358c720f8 Mon Sep 17 00:00:00 2001
From: Faraz Khan <frzk410@gmail.com>
Date: Sat, 11 Nov 2023 03:59:42 +0500
Subject: [PATCH 4/4] include location with 3 parts (#69)

---
 pyproject.toml                           | 2 +-
 src/jobspy/scrapers/linkedin/__init__.py | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 12a694d..08272d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.27"
+version = "1.1.28"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 5fcc696..67d2898 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -278,5 +278,12 @@ class LinkedInScraper(Scraper):
                     state=state,
                     country=Country.from_string(self.country),
                 )
+            elif len(parts) == 3:
+                city, state, country = parts
+                location = Location(
+                    city=city,
+                    state=state,
+                    country=Country.from_string(country),
+                )
 
         return location