反正也關了,放上來以資紀念。只是一個嘗試,不過因爲有圖片,不知道怎麼抓,失敗,放棄。

#!/usr/bin/env python

# Filename: noname.py


import re
from html.parser import HTMLParser
from urllib.request import Request
from urllib.request import urlopen

try:
    # sgmllib was removed in Python 3.0; the import is kept (guarded) so a
    # missing module no longer aborts the script before anything runs.
    from sgmllib import SGMLParser
except ImportError:
    SGMLParser = None

# Blog account name; used to build the page URL and to filter links.
account = 'arbodino'

# Page to fetch for that account (presumably the article listing, judging by
# the "list=1" suffix -- TODO confirm).
url = ''.join(('http://www.wretch.cc/blog/', account, '&list=1'))

# Browser-like request headers sent with every request.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    ),
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# establish webpage connection and get page html source

def getsource(url):
    """Fetch *url* and return the response body as decoded text.

    The request is sent with the module-level ``hdr`` headers.  The body is
    decoded with the charset declared in the response's Content-Type header,
    falling back to UTF-8 when none is declared or the declared one is
    unknown to Python; undecodable bytes are replaced rather than raising.

    Args:
        url: Address of the page to download.

    Returns:
        The page source as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    req = Request(url, headers=hdr)
    # The context manager closes the connection even if read() fails.
    with urlopen(req) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        raw = response.read()
    # str(raw) would yield the "b'...'" repr with escaped bytes, not the
    # HTML itself, so decode explicitly.
    try:
        return raw.decode(charset, errors='replace')
    except LookupError:
        # The server declared a charset name Python does not recognise.
        return raw.decode('utf-8', errors='replace')

# Link collector adapted from the diveintopython.net example; built on
# html.parser because sgmllib no longer exists in Python 3.

class URLLister(HTMLParser):
    """Collect the ``href`` value of every ``<a>`` tag fed to the parser.

    After ``feed()`` has been called, ``urls`` holds the collected values in
    document order.
    """

    def reset(self):
        """Reset the parser state and start with an empty ``urls`` list."""
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Route ``<a>`` start tags to ``start_a``; ignore every other tag."""
        if tag == 'a':
            self.start_a(attrs)

    def start_a(self, attrs):
        """Record the href value(s) found in an ``<a>`` tag's attributes.

        Args:
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        # HTMLParser reports a value-less attribute (``<a href>``) as None;
        # skip those so ``urls`` only ever contains strings.
        self.urls.extend(
            value for name, value in attrs
            if name == 'href' and value is not None
        )
            
# Download the page, collect its links, and print the ones that point into
# this account's blog.
parser = URLLister()
parser.feed(getsource(url))
parser.close()

# Compare against the full prefix instead of slicing with a hand-counted
# length, and use a separate loop name so the module-level ``url`` is not
# overwritten.
blog_prefix = 'http://www.wretch.cc/blog/' + account + '/'
for link in parser.urls:
    if link.startswith(blog_prefix):
        print(link)

Comments

comments powered by Disqus