使用 Python 爬取 Jekyll 文档

开发日常

Publish Date: 2017-02-19

开发环境配置
Ubuntu: 16.10
Python: 3.5.2+
wkhtmltopdf: 0.12.4 (with patched qt)

获取页面 URL 列表[^1]

def get_url_list(url):
    """Collect the absolute URLs linked from the page's sidebar.

    Fetches ``url``, takes the second ``<aside>`` element of the page and
    resolves the ``href`` of the first link inside each of its ``<li>``
    items against ``https://jekyllrb.com/``.

    Args:
        url: Address of the page whose sidebar is scanned.

    Returns:
        A list of absolute URL strings in document order, or ``None`` when
        the response status code is not 200.

    Raises:
        IndexError: If the page contains fewer than two ``<aside>`` elements.
    """
    from urllib.parse import urljoin

    response = requests.get(url)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.content, "html5lib")
    sidebar = soup.find_all("aside")[1]  # the second <aside> holds the sidebar
    urls = []
    for li in sidebar.find_all("li"):
        anchor = li.a
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # List items without a link (or with an empty href) have
            # nothing to crawl; skipping them avoids AttributeError/TypeError.
            continue
        # urljoin avoids a doubled "/" for root-relative hrefs and leaves
        # already-absolute hrefs intact, unlike plain string concatenation.
        urls.append(urljoin("https://jekyllrb.com/", href))

    return urls

解析页面，保存为 HTML 文件

def parse_url_to_html(url):
    """Download ``url`` and store its documentation section as an HTML file.

    The first element whose class matches ``"unit four-fifths"`` is taken as
    the document content, inserted into ``html_template`` and written,
    UTF-8 encoded, to ``./data/demo.html``.

    Args:
        url: Address of the page to download.

    Returns:
        The path of the written file, or ``None`` when the response status
        code is not 200.
    """
    page = requests.get(url)
    if page.status_code != 200:
        return None

    document = BeautifulSoup(page.content, 'html.parser')
    # The first matching element carries the documentation content.
    content_nodes = document.find_all(class_="unit four-fifths")
    rendered = html_template.format(content=str(content_nodes[0]))

    file_path = "./data/demo.html"
    with open(file_path, "wb") as out:
        # The file is opened in binary mode, so encode the text first.
        out.write(rendered.encode("utf_8"))

    return file_path

将html文件保存为`pdf`

def save_pdf(file_path):
    """Convert the HTML file at ``file_path`` into ``./data/demo.pdf``.

    Args:
        file_path: Path of the HTML file handed to ``pdfkit.from_file``.
    """
    # Options forwarded to pdfkit for the conversion.
    options = {
        'page-size': 'Letter',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ]
    }
    pdfkit.from_file(file_path, "./data/demo.pdf", options=options)