2024年12月14日 星期六

用Python爬取網站內容:SEO 分析與評分

以下程式是由ChatGPT所產生。

安裝套件:

pip install requests beautifulsoup4


範例一、網站標題和描述

import requests
from bs4 import BeautifulSoup

def seo_scraper(url):
    """Fetch a page and print its <title> and meta description.

    Args:
        url: Absolute URL of the page to analyze.

    Network/HTTP errors are caught and reported instead of propagating,
    matching the error-handling style of the later examples in this file.
    """
    try:
        # Time-bound the request so a hung server cannot block forever,
        # and treat HTTP error status codes as failures.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"請求失敗: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # <title> may be absent; fall back to a placeholder.
    title = soup.title.string if soup.title else 'No Title'
    meta_description = soup.find('meta', attrs={'name': 'description'})
    description = meta_description['content'] if meta_description else 'No Description'

    print(f"網站標題: {title}")
    print(f"網站描述: {description}")

# Example usage (performs a live HTTP request)
seo_scraper('https://example.com')

範例二、提取 SEO 相關資訊,如標題、描述、H1 標籤及所有內部連結

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def seo_analyze(url):
    """Fetch *url* and print basic SEO signals: page title, meta
    description, H1 headings, and the set of internal links.

    Args:
        url: Absolute URL of the page to analyze.

    Request failures are reported, not raised.
    """
    from urllib.parse import urlparse  # stdlib; only this function needs it

    try:
        # Time-bound the request and treat HTTP error statuses as failures.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Page title
        title = soup.title.string if soup.title else 'No Title'
        print(f"網站標題: {title}")

        # Meta description
        meta_description = soup.find('meta', attrs={'name': 'description'})
        description = meta_description['content'] if meta_description else 'No Description'
        print(f"網站描述: {description}")

        # H1 headings
        h1_texts = [h1.get_text(strip=True) for h1 in soup.find_all('h1')]
        print(f"H1 標籤: {h1_texts if h1_texts else 'No H1 Tags'}")

        # Internal links: compare registered hosts (netloc) rather than the
        # original substring test (`url in full_url`), which wrongly matched
        # any URL that merely contained the base URL as text.
        base_domain = urlparse(url).netloc
        internal_links = set()
        for a_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, a_tag['href'])
            if urlparse(full_url).netloc == base_domain:
                internal_links.add(full_url)

        print("\n內部連結:")
        for link in internal_links:
            print(link)

    except requests.RequestException as e:
        print(f"請求失敗: {e}")

# 測試範例
website_url = 'https://example.com'  # 替換為你要分析的網址
seo_analyze(website_url)

範例三、檢查圖片的 Alt 屬性,確保圖片有適當的描述
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def seo_analyze(url):
    """Fetch *url* and print SEO signals including an image Alt audit.

    Reports: page title, meta description, H1 headings, images whose
    ``alt`` attribute is missing or empty (the point of this example),
    and the set of internal links.

    Args:
        url: Absolute URL of the page to analyze.

    Request failures are reported, not raised.
    """
    from urllib.parse import urlparse  # stdlib; only this function needs it

    try:
        # Time-bound the request and treat HTTP error statuses as failures.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Page title
        title = soup.title.string if soup.title else 'No Title'
        print(f"網站標題: {title}")

        # Meta description
        meta_description = soup.find('meta', attrs={'name': 'description'})
        description = meta_description['content'] if meta_description else 'No Description'
        print(f"網站描述: {description}")

        # H1 headings
        h1_texts = [h1.get_text(strip=True) for h1 in soup.find_all('h1')]
        print(f"H1 標籤: {h1_texts if h1_texts else 'No H1 Tags'}")

        # Image Alt audit: list every <img> whose alt text is missing or empty.
        images = soup.find_all('img')
        missing_alt = [img.get('src', 'No src') for img in images if not img.get('alt')]
        print(f"\n圖片總數: {len(images)},缺少 Alt 的圖片: {len(missing_alt)}")
        for src in missing_alt:
            print(f"⚠️ 缺少 Alt 屬性: {src}")

        # Internal links: compare registered hosts (netloc) rather than a
        # substring test, which wrongly matched any URL containing the base URL.
        base_domain = urlparse(url).netloc
        internal_links = set()
        for a_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, a_tag['href'])
            if urlparse(full_url).netloc == base_domain:
                internal_links.add(full_url)

        print("\n內部連結:")
        for link in internal_links:
            print(link)

    except requests.RequestException as e:
        print(f"請求失敗: {e}")

# 測試範例
website_url = 'https://example.com'  # 替換為你要分析的網址
seo_analyze(website_url)

範例四、提取所有標題 (H1–H6),來分析標題結構是否符合 SEO 最佳實踐

import requests
from bs4 import BeautifulSoup

def extract_headings(url):
    """Fetch *url*, print every H1–H6 heading in document order, then run
    the heading-hierarchy check.

    Args:
        url: Absolute URL of the page to inspect.

    Request failures are reported, not raised.
    """
    try:
        # Time-bound the request and treat HTTP error statuses as failures.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

        # One find_all over the tag list preserves document order.  The
        # original per-tag loops grouped headings by level (all H1s, then
        # all H2s, ...), which made the level-jump analysis below meaningless.
        headings = [
            (heading.name.upper(), heading.get_text(strip=True))
            for heading in soup.find_all(heading_tags)
        ]

        if not headings:
            print("未找到任何標題標籤。")
            return

        # Print the heading outline as found on the page.
        print(f"共找到 {len(headings)} 個標題標籤:\n")
        for level, text in headings:
            print(f"{level}: {text}")

        # Flag skipped heading levels.
        analyze_heading_structure(headings)

    except requests.RequestException as e:
        print(f"請求失敗: {e}")

def analyze_heading_structure(headings):
    """Check a heading sequence for skipped levels (e.g. H1 straight to H3).

    Args:
        headings: List of ``(level, text)`` tuples in document order, where
            *level* is one of "H1".."H6".

    Returns:
        List of warning strings, one per detected level jump (empty when
        the hierarchy is well-formed).  Warnings are also printed, so
        existing print-only callers keep working.
    """
    print("\n🔍 標題結構分析:")
    level_map = {"H1": 1, "H2": 2, "H3": 3, "H4": 4, "H5": 5, "H6": 6}
    warnings = []
    previous_level = 0  # 0 = no heading seen yet; the first heading never warns

    for level, text in headings:
        current_level = level_map[level]

        # A jump of more than one level breaks the document outline and is
        # an SEO smell (e.g. H2 followed directly by H4).
        if previous_level and current_level > previous_level + 1:
            warning = f"⚠️ 標題層級跳躍:從 {previous_level} 跳到 {current_level} - '{text}'"
            print(warning)
            warnings.append(warning)

        previous_level = current_level

    print("✅ 標題層級檢查完成。")
    return warnings

# 測試範例
website_url = 'https://example.com'  # 請替換為你要檢查的網址
extract_headings(website_url)

範例五、SEO評分
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def seo_score(url):
    """Fetch *url* and print a simple 0–100 SEO score with deduction notes.

    Checks: title tag (15), meta description (15), H1 presence (10),
    image alt coverage (15, partial credit 5), at least 3 internal links
    (15), at least 1 external link (10), and HTTPS (20).

    Args:
        url: Absolute URL of the page to score.

    Request failures are reported, not raised.
    """
    try:
        # Time-bound the request and treat HTTP error statuses as failures.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        print(f"\n🔍 正在分析網站: {url}\n")

        score = 0
        max_score = 100
        deductions = []

        # 1. Title tag (15 points)
        title = soup.title.string if soup.title else None
        if title:
            score += 15
        else:
            deductions.append("❌ 缺少標題標籤 (Title) (-15分)")

        # 2. Meta description (15 points)
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            score += 15
        else:
            deductions.append("❌ 缺少 Meta 描述 (-15分)")

        # 3. H1 presence (10 points)
        if soup.find("h1"):
            score += 10
        else:
            deductions.append("❌ 缺少 H1 標籤 (-10分)")

        # 4. Image alt coverage (15 points; 5 when only some images have alt)
        images = soup.find_all("img")
        images_with_alt = [img for img in images if img.get("alt")]
        if images and len(images_with_alt) == len(images):
            score += 15
        elif images:
            deductions.append("❌ 部分圖片缺少 Alt 屬性 (-10分)")
            score += 5
        else:
            deductions.append("❌ 未找到圖片 (-15分)")

        # 5 & 6. Internal/external links, classified in a single pass over
        # the anchors instead of scanning all <a> tags twice as before.
        domain = urlparse(url).netloc
        internal_links = set()
        external_links = set()
        for link in soup.find_all("a", href=True):
            full_url = urljoin(url, link["href"])
            if urlparse(full_url).netloc == domain:
                internal_links.add(full_url)
            else:
                external_links.add(full_url)

        if len(internal_links) >= 3:
            score += 15
        else:
            deductions.append("❌ 內部連結少於 3 個 (-15分)")

        if external_links:
            score += 10
        else:
            deductions.append("❌ 缺少外部連結 (-10分)")

        # 7. HTTPS (20 points)
        if urlparse(url).scheme == "https":
            score += 20
        else:
            deductions.append("❌ 未使用 HTTPS (-20分)")

        # Report the final score and any deductions.
        print(f"✅ SEO 總分: {score}/{max_score}\n")

        if deductions:
            print("🔻 扣分項目:")
            for deduction in deductions:
                print(deduction)
        else:
            print("🎉 恭喜!所有 SEO 檢查項目都通過了。")

    except requests.RequestException as e:
        print(f"❌ 請求失敗: {e}")

# 測試範例
website_url = 'https://example.com'  # 請替換為你要分析的網址
seo_score(website_url)

沒有留言:

張貼留言