exceptions
, docstrings
, venv
<tags>
“Can I read this on my Kindle?”
“Can I read this on my Kindle?”
<tags>
tag.subTag.subsubTag
# Find headline of text
headline = article.h2.a.text
find
/find_all
Syntax
.find(tag, attributes, recursive, text, keywords)
.find_all(tag, attributes, recursive, text, limit, keywords)
# [tag] Find all headings in the page
.find_all('h1')
.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
# [attributes] Find all <span> that contain green/red colored text
.find_all('span', {'class': {'green', 'red'}})
# [text] How many times is "Happy Birthday" displayed on the webpage?
.find_all(text='Happy Birthday')
# [keywords]
.find_all(id='span', class_={'green', 'red'})
# Find all title-summary combinations that are colored in green or red
.find_all('div', id={'title','summary'}, class_={'green', 'red'})
“Can I read this on my Kindle?”
<div class="judgments">...</div>
/doc/594125/
” instead of “https://indiankanoon.org/doc/594125/”
#!/usr/bin/env python3
"""Scrape an Indian Kanoon judgment and save it as a local HTML file."""
from bs4 import BeautifulSoup
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch document `urlId` from Indian Kanoon and write `<urlId>.html`.

    Any failure (network, parsing, file I/O) is printed rather than raised;
    the function always returns None.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        # Judgment body lives in <div class="judgments">, its title in a
        # nested <div class="doc_title">.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
<html>
<head><title>{title}</title></head>
<body>{judgement}</body>
</html>
'''
        # Write UTF-8 explicitly: the page was decoded as UTF-8 and the
        # platform default (e.g. cp1252 on Windows) may not be able to
        # encode every character, raising UnicodeEncodeError.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        # Best-effort script: report the error and fall through.
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])
#!/usr/bin/env python3
"""Scrape an Indian Kanoon judgment into a local HTML file (step 2: TODO markers)."""
from bs4 import BeautifulSoup
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch document `urlId` and write `<urlId>.html`; print any error.

    Always returns None.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
<html>
<head><title>{title}</title></head>
<body>{judgement}</body>
</html>
'''
        # [1] <-- Process the links
        # UTF-8 explicitly: the platform default encoding (cp1252 on
        # Windows) may not represent every character of the decoded page.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # [2] <-- Automatically open the file
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])
#!/usr/bin/env python3
"""Scrape a judgment, absolutize its links, save and open the HTML."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch document `urlId`, rewrite relative hrefs to absolute URLs,
    write `<urlId>.html` and open it via the Windows ``start`` command.

    Any failure is printed rather than raised; always returns None.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
<html>
<head><title>{title}</title></head>
<body>{judgement}</body>
</html>
'''
        # Prefix site-relative hrefs (e.g. href="/doc/123/") with the base
        # URL so the links still work from the saved local file.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly: the platform default encoding may not be able
        # to encode every character of the UTF-8-decoded page.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # NOTE(review): shell=True with interpolated urlId is command
        # injection if urlId is untrusted. `start` is a cmd.exe builtin so
        # a shell is required here; only pass trusted numeric document ids.
        subprocess.run(f'''start {urlId}.html''', shell=True)
        # [3] <-- Save ebook (epub/mobi)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])
#!/usr/bin/env python3
"""Scrape a judgment and convert it to HTML, EPUB and MOBI ebook formats."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId):
    """Fetch document `urlId`, save `<urlId>.html`, then shell out to
    pandoc (HTML -> EPUB), kindlegen (EPUB -> MOBI) and ``start`` to open
    the result. Errors are printed; always returns None.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
<html>
<head><title>{title}</title></head>
<body>{judgement}</body>
</html>
'''
        # Make site-relative hrefs absolute so links work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly: the platform default encoding may fail on
        # characters the UTF-8-decoded page contains.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # Note: `\s` is not a Python escape, so the cover path keeps its
        # literal backslash (Windows-style). shell=True is needed for the
        # cmd.exe builtin `start`; only pass trusted document ids.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources\supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])
#!/usr/bin/env python3
"""Scrape a judgment and convert it to HTML/EPUB/MOBI (annotated slide copy)."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId):
    """Fetch document `urlId`, save `<urlId>.html`, convert via pandoc and
    kindlegen, then open the EPUB. Errors are printed; returns None.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')  # [1]
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
<html>
<head><title>{title}</title></head>
<body>{judgement}</body>
</html>
'''
        # Make site-relative hrefs absolute so links work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly: default platform encoding (cp1252 on Windows)
        # can raise UnicodeEncodeError on the decoded page content.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # shell=True is required for the cmd.exe builtin `start`; only pass
        # trusted document ids (interpolation into a shell string).
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources\supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])
#!/usr/bin/env python3
"""Scrape a judgment to HTML/EPUB/MOBI, with specific URL-error handling."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request, error

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId):
    """Fetch document `urlId`, save `<urlId>.html`, convert via pandoc and
    kindlegen, then open the EPUB.

    HTTP errors (bad status) and URL errors (unreachable host) are reported
    separately from any other failure; always returns None.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')  # [1]
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
<html>
<head><title>{title}</title></head>
<body>{judgement}</body>
</html>
'''
        # Make site-relative hrefs absolute so links work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly: the platform default encoding may not encode
        # every character of the UTF-8-decoded page.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # shell=True is required for the cmd.exe builtin `start`; only pass
        # trusted document ids.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources\supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except error.HTTPError as e:
        # Server responded with an error status (404, 500, ...).
        print(e)
    except error.URLError as e:
        # Could not reach the server at all (DNS, refused connection, ...).
        print(e)
    except Exception as e:
        # Anything else (parse failure, file I/O, missing tools).
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])
docstrings)
#!/usr/bin/env python3
"""Final version: scrape a judgment and generate HTML/EPUB/MOBI ebooks."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request, error

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId=None):
    '''
    Scrapes & Generates html/epub/mobi versions of document.

    NOTE(review): the urlId=None default produces the URL ".../doc/None"
    when called without an argument — presumably only kept so the function
    can be imported and inspected; confirm before relying on it.
    '''
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
<html>
<head><title>{title}</title></head>
<body>{judgement}</body>
</html>
'''
        # Make site-relative hrefs absolute so links work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly: the platform default encoding (cp1252 on
        # Windows) may raise UnicodeEncodeError on the decoded page.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # shell=True is required for the cmd.exe builtin `start`; only pass
        # trusted document ids (they are interpolated into shell strings).
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources\supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except error.HTTPError as e:
        # Server responded with an error status.
        print(e)
    except error.URLError as e:
        # Server unreachable (DNS failure, connection refused, ...).
        print(e)
    except Exception as e:
        # Anything else (parse failure, file I/O, missing tools).
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])
venv
requirements.txt
python -m venv myProject\venv; cd myProject
venv\Scripts\activate.bat
pip list
pip install [package-name]
pip install -r requirements.txt
pip freeze > requirements.txt
deactivate
rmdir /s venv
Web scraping is generally legal, but the legality depends on what you scrape and how you use it. Keep a few things in mind:
Scrape the article on this webpage, and create your own ebook using the code used in this talk.
Use this source-code as reference for the proposed problem.
#!/usr/bin/env python3
"""Exercise solution: scrape a Full Stack Python blog post into TEST.html."""
from bs4 import BeautifulSoup
import subprocess
import sys
from urllib import request

urlBase = 'https://www.fullstackpython.com/blog/'


def generateHtml(urlId):
    '''
    Scrapes & generates html version of a document.

    `urlId` is the blog-post slug (e.g. "first-steps-gitpython.html");
    the result is always written to TEST.html and opened. Errors are
    printed; always returns None.
    '''
    url = f'{urlBase}{urlId}'
    try:
        # Get page, filter the contents & save as a new HTML page
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        # Page layout: <div class="cn"> contains rows; row[1] carries the
        # title (h1) and author (first <a>), row[2] the article body.
        entries = html.find('div', class_='cn').find_all('div', class_='row')
        title = entries[1].h1.text
        author = entries[1].a.text
        blog = entries[2]
        content = f'''
<html>
<head><title>{title}</title></head>
<body>
<h1>{title}</h1>
<h2>{author}</h2>
{blog}
</body>
</html>
'''
        # Save & Open HTML file. (These strings have no placeholders, so
        # the original f-prefixes were unnecessary and have been dropped.)
        with open('TEST.html', 'w', encoding='utf-8') as f:
            f.write(content)
        subprocess.run("start TEST.html", shell=True)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])
# py test.py "first-steps-gitpython.html"
[1] “Web Scraping using Python” by Corey Schafer
[2] “RegEx using Python” by Corey Schafer
[3] “Python venv (Windows)” by Corey Schafer
[4] “Web Scraping with Python” by Ryan Mitchell
[5] “Legal Aspects” by Data Carpentry