Creating eBooks from Webpages using Python






Chaitanya Tejaswi


21st April, 2020

Objectives

Create eBooks from Webpages

Assumptions

Dependencies

Motivation for This Talk

“Can I read this on my Kindle?”

The Solution: On Amazon Kindle

The Solution: On Android Device

How To Do It?

“Can I read this on my Kindle?”

  1. Send an HTTP request to the server for the file.
  2. Get the file and extract the “title” & “judgement” (summary).
  3. Save the result to a text/HTML file.
  4. Convert this file to an eBook format that works on both Android & Kindle (see the preview sketch below).
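
Put together, the four steps amount to only a short script. A rough preview, assuming a placeholder document id (the real, working version is built up step by step later in the talk):

#!/usr/bin/env python3
# Preview of the whole pipeline; each step is explained on the next slides.
from urllib import request
from bs4 import BeautifulSoup

url = 'https://indiankanoon.org/doc/1234567'              # [1] request the page (placeholder id)
page = request.urlopen(url).read().decode('utf-8')

soup = BeautifulSoup(page, 'lxml')                        # [2] extract title & judgement
judgement = soup.find('div', class_='judgments')
title = judgement.find('div', class_='doc_title').text

with open('judgement.html', 'w', encoding='utf-8') as f:  # [3] save as an HTML file
    f.write(f'<html><head><title>{title}</title></head><body>{judgement}</body></html>')

# [4] convert to EPUB/MOBI with pandoc & kindlegen (shown later in the talk)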

But First, Some Prerequisites

Send A Request, Retrieve A File

from urllib import request
...
response = request.urlopen(url).read().decode('utf-8')

Create An HTML Object

from bs4 import BeautifulSoup
...
html = BeautifulSoup(response, 'lxml')
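
Once the response has been parsed, the resulting object can be queried directly. A minimal, self-contained example (the URL is just a placeholder):

from urllib import request
from bs4 import BeautifulSoup

response = request.urlopen('https://example.com').read().decode('utf-8')
html = BeautifulSoup(response, 'lxml')
print(html.title.text)           # the page's <title>
print(len(html.find_all('a')))   # number of links on the page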

Finding <tags>

# Find headline of text
headline = article.h2.a.text
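
Here `article` is assumed to be a Tag returned by an earlier find()/find_all() call. On a small, made-up HTML snippet the same dotted navigation looks like this:

from bs4 import BeautifulSoup

snippet = '''
<div class="article">
    <h2><a href="/post/42">Scraping 101</a></h2>
    <p>Body text ...</p>
</div>
'''
article = BeautifulSoup(snippet, 'lxml').div
print(article.h2.a.text)     # Scraping 101
print(article.h2.a['href'])  # /post/42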

Syntax

.find(tag, attributes, recursive, text, keywords)
.find_all(tag, attributes, recursive, text, limit, keywords)
# [tag] Find all headings in the page
.find_all('h1')
.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

# [attributes] Find all <span> that contain green/red colored text
.find_all('span', {'class': {'green', 'red'}})

# [text] How many times is "Happy Birthday" displayed on the webpage?
#        (take len() of the list returned below to get the count)
.find_all(text='Happy Birthday')

# [keywords] Filter on attributes passed as keyword arguments
.find_all(id='title', class_={'green', 'red'})

Note: Values grouped in a set are OR'd; separate attribute filters are AND'd.
# Find all title/summary <div>s that are colored green or red
.find_all('div', id={'title', 'summary'}, class_={'green', 'red'})
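
A small self-contained demonstration of these filters, using made-up HTML: values grouped in a set are OR'd, while separate attribute filters are AND'd.

from bs4 import BeautifulSoup

page = '''
<div id="title"   class="green">Accepted</div>
<div id="summary" class="red">Rejected</div>
<div id="footer"  class="green">Irrelevant</div>
'''
bs = BeautifulSoup(page, 'lxml')

# OR within a set: any <div> whose class is green or red (all three match)
print(len(bs.find_all('div', class_={'green', 'red'})))        # 3

# AND across filters: the id must ALSO be title or summary (footer drops out)
print(len(bs.find_all('div', id={'title', 'summary'},
                      class_={'green', 'red'})))               # 2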

Observations

“Can I read this on my Kindle?”

Source Code: Let’s Jump In!

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    url = f'{urlBase}/doc/{urlId}'
    try:
        # Fetch the page and decode it as text
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        # Pull out the judgement block and its title
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        # Wrap the extracted content in a minimal HTML document
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Save the page next to the script
        with open(f'{urlId}.html', 'w') as f:
            f.write(content)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

Source Code: Let’s Jump In!

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # [1] <-- Process the links
        with open(f'{urlId}.html', 'w') as f:
            f.write(content)
        # [2] <-- Automatically open the file
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

Source Code: Let’s Jump In!

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        with open(f'{urlId}.html', 'w') as f:
            f.write(content)
        subprocess.run(f'''start {urlId}.html''', shell=True)
        # [3] <-- Save ebook (epub/mobi)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])
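
The new re.sub call rewrites the page's relative links (href="/doc/...") into absolute URLs that still work from the saved local copy. In isolation, with a made-up href:

import re

urlBase = 'https://indiankanoon.org'
content = '<a href="/doc/1234567/">cited case</a>'
content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                 fr'''\1{urlBase}\2"''', content)
print(content)   # <a href="https://indiankanoon.org/doc/1234567/">cited case</a>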

Source Code: Let’s Jump In!

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId):
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        with open(f'{urlId}.html', 'w') as f:
            f.write(content)
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources\supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])

Disclaimer: Don’t Use This In Production Code

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId):
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8') # [1]
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        with open(f'{urlId}.html', 'w') as f:
            f.write(content)
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources\supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])

Slightly Better

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request, error

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId):
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8') # [1]
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        with open(f'{urlId}.html', 'w') as f:
            f.write(content)
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources\supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except error.HTTPError as e:
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])
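
The urlopen call is not the only thing that can fail quietly: the pandoc/kindlegen/start commands run with shell=True and their exit codes are never checked. One possible way to tighten that up (a sketch, not part of the talk's code; file names are placeholders):

import shutil
import subprocess

# Fail early if the external tools are not on the PATH
for tool in ('pandoc', 'kindlegen'):
    if shutil.which(tool) is None:
        raise SystemExit(f'{tool} not found; install it first')

# Passing a list avoids shell=True, and check=True raises CalledProcessError
# if pandoc exits with a non-zero status
subprocess.run(['pandoc', 'doc.html', '-o', 'doc.epub'], check=True)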

Document Your Code (using docstrings)

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request, error

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId=None):
    '''
    Scrapes & Generates html/epub/mobi versions of document.
    '''
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        with open(f'{urlId}.html', 'w') as f:
            f.write(content)
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources\supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except error.HTTPError as e:
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])
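
The one-line docstring already shows up in help() and editor tooltips; a slightly fuller version spelling out the argument and side effects is just as easy (a suggestion, not part of the talk's code; the example id is made up):

def generateMobi(urlId=None):
    '''
    Scrape an Indian Kanoon judgement and save HTML/EPUB/MOBI copies of it.

    Args:
        urlId: the numeric document id from the page URL,
               e.g. the 1234567 in https://indiankanoon.org/doc/1234567/

    Side effects:
        Writes <urlId>.html, <urlId>.epub and <urlId>.mobi to the current
        directory and opens the EPUB with the system's default viewer.
    '''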

Use Virtual Environments (venv)

Why Virtual Environments?
To isolate packages used in a project from the packages installed on the system.

Steps
python -m venv myProject\venv
cd myProject
venv\Scripts\activate.bat
pip list
pip install [package-name]
pip install -r requirements.txt
pip freeze > requirements.txt
deactivate
rmdir /s venv
Example

Homework: Problem

Make Your Own eBook

Scrape the article on this webpage, and create your own eBook using the code from this talk.

Steps
  1. Visit the Dependencies page of this talk and install all necessary software.
  2. Modify the final code to capture the article’s heading & main content.
  3. Create an HTML file & save it locally.
  4. Convert this HTML file to EPUB, and try opening it on your phone using Google’s Play Books app.
Solutions will be posted on Saturday (25-04-2020)

Homework: Solution

Use this source code as a reference for the proposed problem.

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import subprocess
import sys
from urllib import request

urlBase = 'https://www.fullstackpython.com/blog/'


def generateHtml(urlId):
    '''
    Scrapes & generates html version of a document.
    '''
    url = f'{urlBase}{urlId}'
    try:
        # Get page, filter the contents & save as a new HTML page
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        entries = html.find('div', class_='cn').find_all('div', class_='row')
        title = entries[1].h1.text
        author = entries[1].a.text
        blog = entries[2]
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>
                <h1>{title}</h1>
                <h2>{author}</h2>
                {blog}
            </body>
        </html>
        '''
        # Save & Open HTML file
        with open('TEST.html', 'w', encoding='utf-8') as f:
            f.write(content)
        subprocess.run('start TEST.html', shell=True)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

# py test.py "first-steps-gitpython.html"
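
The solution stops at the HTML stage; step 4 of the homework (HTML to EPUB) can reuse the pandoc call from earlier in the talk, roughly:

import subprocess

# Convert the saved page to EPUB (pandoc must be installed and on the PATH)
subprocess.run('pandoc TEST.html -o TEST.epub', shell=True)
# Copy TEST.epub to your phone and open it with Google Play Books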

References

[1] “Web Scraping using Python” by Corey Schafer
[2] “RegEx using Python” by Corey Schafer
[3] “Python venv (Windows)” by Corey Schafer
[4] “Web Scraping with Python” by Ryan Mitchell
[5] “Legal Aspects” by Data Carpentry