Using Python as a Web Scraping Tool

This forum provides Python automation content relevant to Chartered Accountants

Moderator: ramajayam

Post Reply
admin
Site Admin
Posts: 67
Joined: Wed Feb 07, 2024 8:24 am

Using Python as a Web Scraping Tool

Post by admin »

Prompt

Code: Select all

https://disc.icai.org/reports-orders-of-the-disciplinary-committee-for-the-council-year-2023-2024/


Please generate a Python script to download all the PDF case laws from the above webpage.

The Generated Code is like

Code: Select all

pip install requests


import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# URL of the webpage containing the reports and orders.
# NOTE(review): the prompt above targets the 2023-2024 council year, but this
# URL points at 2024-2025 — confirm which year is intended.
BASE_URL = "https://disc.icai.org/reports-orders-of-the-disciplinary-committee-for-the-council-year-2024-2025/"

# Create a directory to save the orders.
# Path is relative to the current working directory.
DOWNLOAD_DIR = "ICAI_Orders"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)  # exist_ok: safe to re-run the script

def download_file(url, filename):
    """Download a file from *url* and save it to *filename*.

    Streams the response in 1 KiB chunks so large PDFs are never held
    entirely in memory.  Prints one status line per file: success with the
    saved filename, or failure with the HTTP status code.
    """
    # timeout prevents a single hung connection from stalling the whole run
    response = requests.get(url, stream=True, timeout=30)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
        # BUG FIX: the original printed the literal text "(unknown)" here
        # instead of interpolating the saved filename.
        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download: {url} (Status code: {response.status_code})")

def scrape_and_download(base_url):
    """Scrape *base_url* and download every linked PDF into DOWNLOAD_DIR.

    Anchors whose ``href`` ends in ``.pdf`` (case-insensitive) are treated
    as order documents.  Relative links are resolved against *base_url*.
    Network or parsing failures are reported to stdout, not raised.
    """
    try:
        response = requests.get(base_url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to access the webpage: {base_url} (Status code: {response.status_code})")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        # Walk every anchor that carries an href and pick out the PDFs.
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Case-insensitive so ".PDF" links are not silently skipped.
            if href.lower().endswith('.pdf'):
                # BUG FIX: the original concatenated the *global* BASE_URL
                # with the raw href, which (a) ignored this function's
                # base_url parameter and (b) produced broken URLs for
                # root-relative links such as "/orders/x.pdf".  urljoin
                # handles absolute and relative hrefs correctly.
                file_url = urljoin(base_url, href)
                filename = os.path.join(DOWNLOAD_DIR, os.path.basename(file_url))
                download_file(file_url, filename)
    except Exception as e:
        # Broad catch is acceptable at this top-level entry point: report
        # the error and return instead of crashing the script.
        print(f"An error occurred: {e}")

# Run the scraper.
# NOTE(review): this executes at import time; wrapping it in an
# ``if __name__ == "__main__":`` guard would make the module importable
# without side effects — confirm before changing, as callers may rely on it.
scrape_and_download(BASE_URL)
Post Reply