andrewji8


Website Cloning Crawler (with Python Code)

import argparse
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import zipfile
import re

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
SAVE_DIR = 'sites'

def make_request(url, refer='', proxy_url=''):
    """發送 HTTP 請求,並可選擇性地添加 refer 和代理。"""
    headers = {
        'User-Agent': USER_AGENT
    }
    if refer:
        headers['Referer'] = refer
    if proxy_url:
        url = proxy_url + url
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"發送請求時出錯: {e}")
        return None

def is_file_link(url):
    """檢查 URL 是否指向一個文件。"""
    path = urlparse(url).path
    return bool(re.search(r'\.\w+$', path))

def save_file(file_url, base_url, save_dir, refer='', proxy_url=''):
    """下載並將文件保存在本地。"""
    if file_url.startswith('//'):
        full_url = urlparse(base_url).scheme + ':' + file_url
    elif file_url.startswith(('http://', 'https://')):
        full_url = file_url
    else:
        full_url = urljoin(base_url.rstrip('/') + '/', file_url.lstrip('/'))

    parsed_url = urlparse(full_url)
    path = parsed_url.path
    extension = os.path.splitext(path)[1][1:] or 'unknown'
    filename = os.path.basename(path)

    extension_dir = os.path.join(save_dir, extension)
    os.makedirs(extension_dir, exist_ok=True)
    file_path = os.path.join(extension_dir, filename)

    try:
        headers = {'User-Agent': USER_AGENT}
        if refer:
            headers['Referer'] = refer
        # Mirror make_request: the prefix-style proxy is prepended to the target URL.
        if proxy_url:
            full_url = proxy_url + full_url
        response = requests.get(full_url, headers=headers)
        response.raise_for_status()

        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Saved file: {full_url}")
        return file_path
    except requests.exceptions.RequestException as e:
        print(f"Error saving file {full_url}: {e}")
        return None

def process_file_tags(soup, base_url, save_dir, refer, proxy_url):
    """處理包含文件引用的 HTML 標籤。"""
    file_tags = {
        'script': 'src',
        'link': 'href',
        'img': 'src',
        'audio': 'src',
        'video': 'src',
        'source': 'src',
        'img': 'data-original'
    }
    modified_content = str(soup)
    for tag, attribute in file_tags.items():
        elements = soup.find_all(tag, attrs={attribute: True})
        for element in elements:
            file_url = element[attribute]
            print(f"檢測到的鏈接: {file_url}")

            if is_file_link(file_url):
                new_file_path = save_file(file_url, base_url, save_dir, refer, proxy_url)
                if new_file_path:
                    extension = os.path.splitext(new_file_path)[1][1:]
                    relative_path = f'./{extension}/{os.path.basename(new_file_path)}'
                    # Naive text replacement; assumes the attribute value string
                    # is unique enough within the page.
                    modified_content = modified_content.replace(file_url, relative_path)
            else:
                print(f"跳過鏈接: {file_url}")

    return modified_content

def zip_directory(source_dir, zip_path):
    """創建下載文件的 ZIP 壓縮包。"""
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, source_dir)
                zipf.write(file_path, arcname)

def main():
    parser = argparse.ArgumentParser(description='Website mirroring tool')
    parser.add_argument('url', help='URL to mirror')
    parser.add_argument('--refer', help='Referer URL', default='')
    parser.add_argument('--proxy', help='Prefix-style proxy URL (prepended to the target URL)', default='')
    args = parser.parse_args()

    parsed_url = urlparse(args.url)
    host = parsed_url.netloc.replace('.', '')  # e.g. 'example.com' -> 'examplecom'
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    save_dir = os.path.join(SAVE_DIR, host)
    os.makedirs(save_dir, exist_ok=True)

    html_content = make_request(args.url, args.refer, args.proxy)
    if not html_content:
        print("無法獲取網頁")
        return

    soup = BeautifulSoup(html_content, 'html.parser')
    modified_content = process_file_tags(soup, base_url, save_dir, args.refer, args.proxy)

    html_file_path = os.path.join(save_dir, f"{host}.html")
    with open(html_file_path, 'w', encoding='utf-8') as f:
        f.write(modified_content)
    print(f"已保存 HTML 文件: {html_file_path}")

    zip_path = os.path.join(SAVE_DIR, f"{host}.zip")
    zip_directory(save_dir, zip_path)
    print(f"已創建 ZIP 壓縮包: {zip_path}")

if __name__ == '__main__':
    main()
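
The helpers can also be exercised individually. Below is a minimal sketch, assuming the functions above are available in the same module; the URL is illustrative:

# Minimal sketch: fetch a page and count candidate asset links.
html = make_request('https://example.com')
if html:
    soup = BeautifulSoup(html, 'html.parser')
    candidates = [t.get('src') or t.get('href')
                  for t in soup.find_all(['script', 'img', 'link'])]
    print(sum(1 for c in candidates if c and is_file_link(c)), 'downloadable asset links')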

Main features:
Creates a sites folder in the current directory
Creates a subfolder inside sites named after the domain
Groups downloaded files into subfolders named after their file extension (e.g., css/ for stylesheets, js/ for scripts, png/ or jpg/ for images)
Automatically rewrites links in the HTML file to point at the local copies
Finally generates a ZIP archive containing all downloaded files (see the usage sketch below)
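
For instance, assuming the script is saved as clone_site.py (a hypothetical filename), a typical run looks like this. Note that --proxy is a prefix-style web proxy that gets prepended to the target URL, not an HTTP proxy setting:

python clone_site.py https://example.com --refer https://example.com/ --proxy "https://mirror.example/fetch?url="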

For example, if you download https://example.com:
The HTML file is saved as sites/examplecom/examplecom.html
All downloaded files are packed into sites/examplecom.zip
Images are saved under sites/examplecom/png/, sites/examplecom/jpg/, etc., according to their extension
CSS files are saved in sites/examplecom/css/
JavaScript files are saved in sites/examplecom/js/
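
Putting that together, the resulting layout looks roughly like this (the individual file names are illustrative, and the extension folders depend on what the page actually references):

sites/
├── examplecom/
│   ├── examplecom.html
│   ├── css/
│   │   └── style.css
│   ├── js/
│   │   └── app.js
│   └── png/
│       └── logo.png
└── examplecom.zip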

While running, the console shows:
the file links that were detected
the path of each file that was downloaded and saved
the locations of the final HTML file and ZIP archive
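
Based on the print statements in the script, a run against a hypothetical page might produce output along these lines (URLs and paths are illustrative):

Detected link: https://example.com/static/app.js
Saved file: https://example.com/static/app.js
Detected link: /css/style.css
Saved file: https://example.com/css/style.css
Detected link: /about
Skipping link: /about
Saved HTML file: sites/examplecom/examplecom.html
Created ZIP archive: sites/examplecom.zip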
