Dec 17-23, 2020 Vikrant Patil
These notes are available online at http://notes.pipal.in/2020/arcesium_finop_batch3/module3-day4.html
© Pipal Academy LLP
Day 1 | Day 2 | Day 3 | Day 4 | Day 5
We will be using JupyterHub at http://lab.pipal.in for this training. Create a notebook named module3-day4.ipynb for today's session. Before you start, shut down all kernels except today's notebook.
docs for python-selenium
https://selenium-python.readthedocs.io/installation.html#drivers
download geckodriver for the browser you need to launch from python
for firefox
https://github.com/mozilla/geckodriver/releases
create virtual environment for selenium
python -m venv firefox_selenium
activate it using (windows)
firefox_selenium\Scripts\activate.bat
for linux/mac
source firefox_selenium/bin/activate
The geckodriver download for Windows is a zip file that contains geckodriver.exe. Unzip it and copy geckodriver.exe into firefox_selenium\Scripts\
for linux/mac users copy the unzipped executable in firefox_selenium/bin/
pip install selenium
%%file search_arcesium.py
from selenium import webdriver
from selenium.webdriver.common.by import By

# Start a Firefox session; geckodriver has to be reachable from the
# virtual environment (see the setup steps for where to copy it).
driver = webdriver.Firefox()
try:
    driver.get("https://www.arcesium.com/")
    # find_element(By.CLASS_NAME, ...) is available in Selenium 3 and 4;
    # the find_element_by_class_name shortcut was removed in Selenium 4.
    careers = driver.find_element(By.CLASS_NAME, "careers-anchor")
    careers.click()
finally:
    # Always end the session, even when loading the page or the click
    # fails; quit() shuts down the browser and the driver process.
    driver.quit()
documentation for selenium using python
to read pdf files we will need a package called PyPDF2
from jupyter
!pip install PyPDF2
from cmd
pip install PyPDF2
!pip install PyPDF2
!cat download.py
!python download.py https://posoco.in/download/16-07-20_nldc_psp/?wpdmdl=30215 demanddata.pdf
We will try to read this PDF file https://posoco.in/download/16-07-20_nldc_psp/?wpdmdl=30215 and extract table A from page 2.
import PyPDF2
# Open the downloaded report in binary mode and preview its second page.
with open("demanddata.pdf", "rb") as pdf_file:
    reader = PyPDF2.PdfFileReader(pdf_file)
    n = reader.getNumPages()
    # Index 1 is the second page (page numbers start at 0).
    second_page = reader.getPage(1)
    preview = second_page.extractText()[:100]
    print(preview)
def print_pdf_text(filename):
    """Print the first 500 characters of the text of every page of a PDF.

    filename: path of the PDF file to read.

    A line of ten "=" characters is printed after each page's text.
    """
    # Honour the filename argument instead of a hard-coded path.
    with open(filename, "rb") as f:
        pdfreader = PyPDF2.PdfFileReader(f)
        for p in range(pdfreader.getNumPages()):
            page = pdfreader.getPage(p)
            print(page.extractText()[:500])
            print("=" * 10)
# Show a short text preview of every page of the downloaded report.
print_pdf_text("demanddata.pdf")
def get_page(pdffile, pageno):
    """Return the extracted text of one page of a PDF file.

    pdffile: path of the PDF file to open.
    pageno: zero-based page index (1 is the second page).
    """
    # Read the file that was asked for rather than a fixed file name.
    with open(pdffile, "rb") as f:
        pdfreader = PyPDF2.PdfFileReader(f)
        page = pdfreader.getPage(pageno)
        # Extract the text while the file is still open.
        return page.extractText()
# Page index 1 is the second page of the PDF.
print(get_page("demanddata.pdf", 1))
def chunk(items, n, count=None):
    """
    Yield consecutive pieces of ``items``, each of size n.

    [i1, i2, i3, i4, i5, i6, i7 ....i100]
    will be broken into pieces of size n

    items: a sliceable sequence (list, tuple, str, ...).
    n: number of elements in each piece.
    count: number of pieces to yield. Pieces that start beyond the end of
        ``items`` come out empty. When omitted, pieces are yielded until
        ``items`` is used up, so the last piece may be shorter than n.
    """
    if count is None:
        if n <= 0:
            raise ValueError("n must be positive when count is omitted")
        # Ceiling division: just enough pieces to cover every item.
        count = -(-len(items) // n)
    for i in range(count):
        start = i * n
        yield items[start:start + n]  # n items from items, starting at start
def extract_table_A(pagetext):
    """Draft of the table A parser: group the page's lines by header width.

    pagetext: text of one PDF page, as a single string.

    This draft only prepares the iterator of line groups; it does not
    return anything yet.
    """
    # split() is a method of the string; calling the string itself fails.
    lines = pagetext.split("\n")
    header = "NR WR SR ER NER TOTAL"
    headers = header.strip().split()
    data = {}
    # chunk() requires the number of pieces as its third argument;
    # 9 matches the later revisions of this function in these notes.
    steps = chunk(lines, len(headers), 9)
# Try chunk() on the numbers 0..19: four pieces of three numbers each.
for piece in chunk(list(range(20)), 3, 4):
    print(piece)
import random
def randomnums(n):
    """Generate n pseudo-random floats, producing one per request."""
    yield from (random.random() for _ in range(n))
# Calling a generator function does not run its body; it returns a generator object.
ran = randomnums(5)
ran
# reversed() also returns an iterator that hands out one item per next() call.
r = reversed([1, 2, 3, 4, 5])
r
next(r)
next(r)
next(r)
next(r)
next(r)
next(r)  # sixth call: all five items are used up, so this raises StopIteration
ran
# Each next() runs the generator up to its next yield.
next(ran)
next(ran)
next(ran)
ran
def randomnums(n):
    """Generate n random floats, printing a trace line at every step."""
    print("Start generator")
    for step in range(n):
        print("yielding ...", step)
        value = random.random()
        yield value
        # Execution resumes here on the following next() call.
        print("Back to loop")
    print("End of generator")
ran = randomnums(3)  # nothing is printed yet: the body has not started
next(ran)  # prints "Start generator" and "yielding ... 0", then pauses at the yield
next(ran)  # resumes after the yield: prints "Back to loop", then "yielding ... 1"
random.random()
def nhellos(n):
    """Generate the string "hello" n times, printing a trace line at every step."""
    print("Start generator")
    for step in range(n):
        print("yielding ...", step)
        greeting = "hello"
        yield greeting
        # Execution resumes here on the following next() call.
        print("Back to loop")
    print("End of generator")
h = nhellos(2)
next(h)
s = next(h)  # the yielded value is what next() returns
s
next(h)  # third call on a 2-item generator: prints "Back to loop" and "End of generator", then raises StopIteration
# A for loop calls next() for us and stops when the generator is finished.
for value in randomnums(4):
    print(value)
def chunk(items, n, count=None):
    """
    Yield consecutive pieces of ``items``, each of size n.

    [i1, i2, i3, i4, i5, i6, i7 ....i100]
    will be broken into pieces of size n

    items: a sliceable sequence (list, tuple, str, ...).
    n: number of elements in each piece.
    count: number of pieces to yield. Pieces that start beyond the end of
        ``items`` come out empty. When omitted, pieces are yielded until
        ``items`` is used up, so the last piece may be shorter than n.
    """
    if count is None:
        if n <= 0:
            raise ValueError("n must be positive when count is omitted")
        # Ceiling division: just enough pieces to cover every item.
        count = -(-len(items) // n)
    for i in range(count):
        start = i * n
        yield items[start:start + n]  # n items from items, starting at start
def extract_table_A(pagetext):
    """Print the line groups of table A found in the text of one PDF page.

    pagetext: text of one PDF page, as a single string.

    The lines are taken in groups as wide as the header
    "NR WR SR ER NER TOTAL"; nine groups are read, the first one is
    dropped and each of the others is printed.
    """
    column_names = "NR WR SR ER NER TOTAL".strip().split()
    text_lines = pagetext.split("\n")
    groups = chunk(text_lines, len(column_names), 9)
    # Drop the first group (presumably the table's own header row - confirm against the PDF).
    next(groups)
    for group in groups:
        print(group)
# Run the parser on the text of the second page (index 1) of the report.
extract_table_A(get_page("demanddata.pdf", 1))
import pandas as pd
def chunk(items, n, count=None):
    """
    Yield consecutive pieces of ``items``, each of size n.

    [i1, i2, i3, i4, i5, i6, i7 ....i100]
    will be broken into pieces of size n

    items: a sliceable sequence (list, tuple, str, ...).
    n: number of elements in each piece.
    count: number of pieces to yield. Pieces that start beyond the end of
        ``items`` come out empty. When omitted, pieces are yielded until
        ``items`` is used up, so the last piece may be shorter than n.
    """
    if count is None:
        if n <= 0:
            raise ValueError("n must be positive when count is omitted")
        # Ceiling division: just enough pieces to cover every item.
        count = -(-len(items) // n)
    for i in range(count):
        start = i * n
        yield items[start:start + n]  # n items from items, starting at start
def extract_table_A(pagetext):
    """Build a DataFrame of table A from the text of one PDF page.

    pagetext: text of one PDF page, as a single string.

    Returns a pandas DataFrame with one column per header name
    (NR, WR, SR, ER, NER, TOTAL); the cell values are kept as the
    strings found in the page text.
    """
    lines = pagetext.split("\n")
    header = "NR WR SR ER NER TOTAL"
    headers = header.strip().split()
    data = {}
    steps = chunk(lines, len(headers), 9)
    # Drop the first group (presumably the table's own header row - confirm against the PDF).
    next(steps)
    for row in steps:
        for h, d in zip(headers, row):
            data.setdefault(h, []).append(d)
    # Hand the table back to the caller instead of building and discarding it.
    return pd.DataFrame(data)
d = {}
d['x']  # raises KeyError: the key is missing from the empty dict
d.get('x', 0)  # returns the default 0 and leaves d unchanged
d
d.setdefault("x", 0)  # also returns 0, but stores d["x"] = 0 as well
d
import pandas as pd
def chunk(items, n, count=None):
    """
    Yield consecutive pieces of ``items``, each of size n.

    [i1, i2, i3, i4, i5, i6, i7 ....i100]
    will be broken into pieces of size n

    items: a sliceable sequence (list, tuple, str, ...).
    n: number of elements in each piece.
    count: number of pieces to yield. Pieces that start beyond the end of
        ``items`` come out empty. When omitted, pieces are yielded until
        ``items`` is used up, so the last piece may be shorter than n.
    """
    if count is None:
        if n <= 0:
            raise ValueError("n must be positive when count is omitted")
        # Ceiling division: just enough pieces to cover every item.
        count = -(-len(items) // n)
    for i in range(count):
        start = i * n
        yield items[start:start + n]  # n items from items, starting at start
def get_page(pdffile, pageno):
    """Return the extracted text of one page of a PDF file.

    pdffile: path of the PDF file to open.
    pageno: zero-based page index (1 is the second page).
    """
    # Read the file that was asked for rather than a fixed file name.
    with open(pdffile, "rb") as f:
        pdfreader = PyPDF2.PdfFileReader(f)
        page = pdfreader.getPage(pageno)
        # Extract the text while the file is still open.
        return page.extractText()
def extract_table_A(pagetext):
    """Build a DataFrame of table A from the text of one PDF page.

    pagetext: text of one PDF page, as a single string.

    Returns a pandas DataFrame whose columns are named after the header
    "NR WR SR ER NER TOTAL"; cell values stay as strings.
    """
    column_names = "NR WR SR ER NER TOTAL".strip().split()
    groups = chunk(pagetext.split("\n"), len(column_names), 9)
    # Drop the first group (presumably the table's own header row - confirm against the PDF).
    next(groups)
    columns = {}
    for group in groups:
        for name, cell in zip(column_names, group):
            columns.setdefault(name, []).append(cell)
    return pd.DataFrame(columns)
# Run the parser on the text of the second page (index 1) of the report.
extract_table_A(get_page("demanddata.pdf", 1))
%%file extract_tableA.py
"""this script allows extracting table from a pdf file. it assumes
certain format. tested with file https://posoco.in/download/16-07-20_nldc_psp/?wpdmdl=30215
"""
import pandas as pd
import PyPDF2
import typer
app = typer.Typer()
def chunk(items, n, count=None):
    """
    Yield consecutive pieces of ``items``, each of size n.

    [i1, i2, i3, i4, i5, i6, i7 ....i100]
    will be broken into pieces of size n

    items: a sliceable sequence (list, tuple, str, ...).
    n: number of elements in each piece.
    count: number of pieces to yield. Pieces that start beyond the end of
        ``items`` come out empty. When omitted, pieces are yielded until
        ``items`` is used up, so the last piece may be shorter than n.
    """
    if count is None:
        if n <= 0:
            raise ValueError("n must be positive when count is omitted")
        # Ceiling division: just enough pieces to cover every item.
        count = -(-len(items) // n)
    for i in range(count):
        start = i * n
        yield items[start:start + n]  # n items from items, starting at start
def get_page(pdffile, pageno):
    """Return the extracted text of one page of a PDF file.

    pdffile: path of the PDF file to open.
    pageno: zero-based page index (1 is the second page).
    """
    # Read the file that was asked for rather than a fixed file name, so
    # the command line argument actually selects the input.
    with open(pdffile, "rb") as f:
        pdfreader = PyPDF2.PdfFileReader(f)
        page = pdfreader.getPage(pageno)
        # Extract the text while the file is still open.
        return page.extractText()
def extract_table_A(pagetext):
    """Build a DataFrame of table A from the text of one PDF page.

    pagetext: text of one PDF page, as a single string.

    Returns a pandas DataFrame whose columns are named after the header
    "NR WR SR ER NER TOTAL"; cell values stay as strings.
    """
    column_names = "NR WR SR ER NER TOTAL".strip().split()
    groups = chunk(pagetext.split("\n"), len(column_names), 9)
    # Drop the first group (presumably the table's own header row - confirm against the PDF).
    next(groups)
    columns = {}
    for group in groups:
        for name, cell in zip(column_names, group):
            columns.setdefault(name, []).append(cell)
    return pd.DataFrame(columns)
@app.command()
def extract_tableA(pdffile: str, csvfile: str):
    """
    extracts table A from pdffile and saves it in csvfile
    """
    # The docstring above is what typer prints for --help.
    page = get_page(pdffile, 1)  # table A is on the second page (index 1)
    df = extract_table_A(page)
    df.to_csv(csvfile)


# Run the command line app only when executed as a script, not on import.
if __name__ == "__main__":
    app()
Any command line tool has elaborate command line options
!python extract_tableA.py --help
!python extract_tableA.py demanddata.pdf demanddata.csv
!pip install typer
!cat demanddata.csv
%%file head.py
import typer
app = typer.Typer()
@app.command()
def head(filename: str, n: int = 5):
    # Print the first n lines of filename (all of them if the file is shorter).
    # Kept as a comment: a docstring here would be shown by typer in --help.
    with open(filename) as source:
        # zip stops after n lines or at the end of the file, whichever comes first.
        for _, line in zip(range(n), source):
            print(line, end="")


# Run the command line app only when executed as a script, not on import.
if __name__ == "__main__":
    app()
!python head.py --help
!python head.py demanddata.csv
!python head.py --n 3 demanddata.csv
import datetime
# strftime turns a datetime into text according to a format string.
datetime.datetime.today().strftime("%Y")  # four-digit year only
datetime.datetime.today()
datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
time = '2021-01-15 13:07:35'
# strptime goes the other way: it parses text into a datetime using the same format codes.
datetime.datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
time1 = "2030/01/15"
datetime.datetime.strptime(time1, "%Y/%m/%d")
import re #regular expression module
# Sample multi-line text (presumably input for the regex experiments; its use is not shown in this part).
multiplestring = """
fjdsaf hdsg kjfhdsf kjhfds ds
dhf kdsjh
def hello():
print("hello")
sadsad kjshdf jsfkjdhfs kjdshafkjhdsa f
kjhfds kd
kjhdsfkj
kjhd f
kkdjkfj
"""
# Patterns are written as raw strings so that backslash sequences such as
# \d reach the regex engine untouched (in a plain string literal "\d" is an
# invalid escape sequence and triggers a warning on current Python).
empty = re.compile(r"^$")  # empty line
ninechars = re.compile(r"^.........$")  # a line of nine characters
p1 = re.compile(r"\d+.+")  # one or more digits followed by one or more characters
P2 = re.compile(r"^\d+$")  # only digits, one or more
p1.match("hello")  # if there is no match it will return None
p1.match("2kjfdkjf")
P2.match("fdfd")
P2.match("5")
P2.match("5556575")
s = "<c>text</c>"