Notes
Notes - notes.io |
"""
KSU Pharmacy Faculty Directory Scraper
Extracts professor names and emails from Pharmacy department (282 faculty members)
URL: https://faculty.ksu.edu.sa/en/faculty?field_faculty_college_target_id=22
"""
import csv
import sys
import time
from typing import Dict, List, Optional
from urllib.parse import unquote, urlencode, urljoin

import requests
from bs4 import BeautifulSoup
# Site root; relative profile links are resolved against it with urljoin().
BASE_URL = "https://faculty.ksu.edu.sa"

# Value sent as the ``field_faculty_college_target_id`` query filter.
PHARMACY_ID = 22

# Listing endpoint that the paginated faculty queries are issued against.
FACULTY_LIST_URL = BASE_URL + "/en/faculty"

# Pharmacy has 19 pages (0-18 with 15 per page = 282 members).
MAX_PAGES = 19

# Pause between consecutive profile requests, passed to time.sleep().
RATE_LIMIT = 0.3

# Passed as ``timeout=`` to every HTTP GET.
DOWNLOAD_TIMEOUT = 10
class KSUPharmacyFacultyWebScraper:
    """Scrape names and e-mail addresses from the KSU Pharmacy faculty listing.

    The scraper requests each listing page (filtered by ``PHARMACY_ID``),
    collects the profile links found inside heading elements, fetches every
    profile page, reads the first usable ``mailto:`` link, and finally writes
    all rows to ``output_file`` as CSV.

    The e-mail column holds either an address or one of the sentinel strings
    ``'FETCH_ERROR'``, ``'PARSE_ERROR'`` or ``'NOT_FOUND'``.
    """

    # Tag names searched for profile links. 'heading' is kept from the original
    # code, but it is not an HTML element, so the real heading tags h1-h6 are
    # searched as well.
    # NOTE(review): confirm against the live markup which heading level wraps
    # the profile links and narrow this tuple if other headings leak in.
    HEADING_TAGS = ('heading', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self, output_file='ksu_pharmacy_faculty.csv'):
        """Create the HTTP session and empty result state.

        Args:
            output_file: Path of the CSV file written by ``save_to_csv``.
        """
        self.output_file = output_file
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        # One dict per faculty member: 'name', 'email', 'profile_url'.
        self.faculty_data = []
        # Running count of fetch, parse and save failures.
        self.total_errors = 0

    def get_page(self, url: str) -> Optional[bytes]:
        """Fetch ``url`` and return the raw response body.

        Returns:
            The response content, or ``None`` when the request fails or the
            server answers with an error status. Failures are counted in
            ``total_errors`` and reported on stderr.
        """
        try:
            response = self.session.get(url, timeout=DOWNLOAD_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            self.total_errors += 1
            print(f"[ERROR] Failed to fetch: {e}", file=sys.stderr)
            return None
        return response.content

    def extract_profiles_from_page(self, page_num: int) -> List[Dict]:
        """Extract profile links from one Pharmacy faculty listing page.

        Args:
            page_num: Zero-based page index sent as the ``page`` parameter.

        Returns:
            A list of ``{'name': ..., 'url': ...}`` dicts, one per distinct
            profile link. The list is empty when the page cannot be fetched
            or parsed, or when it contains no matching links.
        """
        # Build URL with Pharmacy filter (field_faculty_college_target_id=22)
        params = {
            'search': '',
            'field_faculty_college_target_id': PHARMACY_ID,
            'field_faculty_degree_value': 'All',
            'sort_by': 'created',
            'sort_order': 'DESC',
            'items_per_page': 15,
            'page': page_num,
        }
        url = f'{FACULTY_LIST_URL}?{urlencode(params)}'
        content = self.get_page(url)
        if not content:
            return []

        try:
            soup = BeautifulSoup(content, 'html.parser')
            profiles = []
            seen_urls = set()
            for heading in soup.find_all(list(self.HEADING_TAGS)):
                link = heading.find('a', href=True)
                if link is None:
                    continue
                name = link.get_text(strip=True)
                href = link['href']
                # Only English-site links with visible text count as profiles.
                if not name or '/en/' not in href:
                    continue
                profile_url = urljoin(BASE_URL, href)
                # Searching several heading levels can reach the same link
                # twice; keep the first occurrence only.
                if profile_url in seen_urls:
                    continue
                seen_urls.add(profile_url)
                profiles.append({'name': name, 'url': profile_url})
            return profiles
        except Exception as e:
            print(f"[ERROR] Failed to parse page {page_num}: {e}", file=sys.stderr)
            self.total_errors += 1
            return []

    @staticmethod
    def _email_from_mailto(href) -> str:
        """Return the address part of a ``mailto:`` URL, or ``''`` if none.

        Query parameters such as ``?subject=...`` are dropped and
        percent-escapes are decoded. A falsy ``href`` or one whose scheme is
        not ``mailto`` gives ``''``.
        """
        if not href:
            return ''
        scheme, sep, rest = href.strip().partition(':')
        if not sep or scheme.lower() != 'mailto':
            return ''
        address = rest.split('?', 1)[0]
        return unquote(address).strip()

    def extract_email_from_profile(self, profile_url: str) -> str:
        """Extract the e-mail address from a faculty profile page.

        The address is taken from the ``mailto:`` href itself, because the
        link's visible text may be a label or empty. The visible text is used
        only as a fallback.

        Returns:
            The address, ``'FETCH_ERROR'`` when the page cannot be downloaded,
            ``'NOT_FOUND'`` when the page has no mailto link, or
            ``'PARSE_ERROR'`` when links exist but none yields an address or
            parsing raises.
        """
        content = self.get_page(profile_url)
        if not content:
            return 'FETCH_ERROR'

        try:
            soup = BeautifulSoup(content, 'html.parser')
            email_links = soup.find_all(
                'a',
                href=lambda value: bool(value) and 'mailto:' in value.lower(),
            )
            if not email_links:
                return 'NOT_FOUND'
            for link in email_links:
                email = (self._email_from_mailto(link.get('href'))
                         or link.get_text(strip=True))
                if email:
                    return email
            return 'PARSE_ERROR'
        except Exception as e:
            print(f"[ERROR] Failed to parse profile: {e}", file=sys.stderr)
            self.total_errors += 1
            return 'PARSE_ERROR'

    def scrape_all(self) -> List[Dict]:
        """Scrape every listing page and profile, then save the CSV.

        A listing page that fails to download or parse is skipped so that one
        transient error does not end the run. A page that loads but contains
        no profiles is treated as the end of the listing.

        Returns:
            ``self.faculty_data``, the accumulated list of row dicts.
        """
        print("="*70)
        print("KSU PHARMACY FACULTY DIRECTORY SCRAPER")
        print("="*70)
        print("Target: Pharmacy Department (282 faculty members)")
        print(f"Pages: {MAX_PAGES} pages (0-{MAX_PAGES-1}) × 15 per page")
        print(f"Output: {self.output_file}")
        print("="*70)
        print()

        total_profiles = 0
        for page_num in range(MAX_PAGES):
            print(f"[Page {page_num+1:2d}/{MAX_PAGES}] Fetching Pharmacy faculty...", end=' ', flush=True)
            # extract_profiles_from_page returns [] both on failure and on an
            # empty page; the error counter tells the two cases apart.
            errors_before = self.total_errors
            profiles = self.extract_profiles_from_page(page_num)
            if not profiles:
                if self.total_errors > errors_before:
                    print("Page failed, skipping.")
                    time.sleep(1)
                    continue
                print("No profiles found, stopping.")
                break

            print(f"Found {len(profiles)} profiles")
            for idx, profile in enumerate(profiles, 1):
                name_short = profile['name'][:35].ljust(35)
                print(f" [{idx:2d}/{len(profiles):2d}] {name_short}...", end=' ', flush=True)
                email = self.extract_email_from_profile(profile['url'])
                self.faculty_data.append({
                    'name': profile['name'],
                    'email': email,
                    'profile_url': profile['url'],
                })
                print(f"✓ {email}")
                total_profiles += 1
                time.sleep(RATE_LIMIT)
            # Longer pause between listing pages.
            time.sleep(1)

        self.save_to_csv()
        print()
        print("="*70)
        print("PHARMACY SCRAPING COMPLETE")
        print("="*70)
        print(f"Total pharmacy faculty extracted: {total_profiles}")
        print("Expected: 282")
        print(f"Total errors: {self.total_errors}")
        print(f"Saved to: {self.output_file}")
        print("="*70)
        return self.faculty_data

    def save_to_csv(self):
        """Write ``faculty_data`` to ``output_file`` as UTF-8 CSV with a header.

        An I/O failure is reported on stderr and counted in ``total_errors``
        instead of being raised.
        """
        try:
            with open(self.output_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=['name', 'email', 'profile_url'])
                writer.writeheader()
                writer.writerows(self.faculty_data)
            # The leading "\n" separates this line from the progress output.
            print(f"\n[SAVE] {len(self.faculty_data)} Pharmacy faculty saved to {self.output_file}")
        except OSError as e:
            print(f"[ERROR] Failed to save CSV: {e}", file=sys.stderr)
            self.total_errors += 1
if __name__ == '__main__':
    # Script entry point: scrape the Pharmacy listing and write the CSV.
    # The "\n" escapes put a blank line before and after the banner.
    print("\nStarting KSU Pharmacy Faculty Directory Scraper...\n")
    scraper = KSUPharmacyFacultyWebScraper('ksu_pharmacy_faculty.csv')
    scraper.scrape_all()
![]() |
Notes is a web-based application for taking notes online. You can take notes and share them with other people. If you like taking long notes, notes.io is designed for you. To date, over 8,000,000,000 notes have been created, and the number keeps growing...
With notes.io;
- You can take a note from anywhere, on any device with an internet connection.
- You can share your notes on social platforms (YouTube, Facebook, Twitter, Instagram, etc.).
- You can quickly share your content without a website, blog, or e-mail.
- You don't need to create an account to share a note. You can send quick, easy, shortened note links by SMS, websites, e-mail, or messaging services (WhatsApp, iMessage, Telegram, Signal).
- Notes.io has a fabulous infrastructure design for short links and lets you share a note as an easy, understandable link.
Fast: Notes.io is built for speed and performance. You can take notes quickly and browse your archive.
Easy: Notes.io doesn’t require installation. Just write and share your note!
Short: Notes.io’s URLs are just 8 characters long. You’ll get a shortened link to your note when you want to share it. (Ex: notes.io/q )
Free: Notes.io has been running for 14 years and has been free since the day it started.
You can create your first note immediately and start sharing it with whomever you wish. If you want to contact us, you can use the following communication channels:
Email: [email protected]
Twitter: http://twitter.com/notesio
Instagram: http://instagram.com/notes.io
Facebook: http://facebook.com/notesio
Regards;
Notes.io Team
