Sep 13-17, 2021 Vikrant Patil
These notes are available online at https://notes.pipal.in/2021/arcesium_finop_batch1/
© Pipal Academy LLP
Day 1 | Day 2 | Day 3 | Day 4 | Day 5
We will be using jupyter hub from https://lab.pipal.in for this training.
login to hub and create a notebook with name module3-day4
url = "https://finance.yahoo.com/quote/{TICKER}/history?p={TICKER}"
url.format(TICKER="IBM")
'https://finance.yahoo.com/quote/IBM/history?p=IBM'
import requests
resp = requests.get(url.format(TICKER="IBM"), headers={'User-Agent':"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:93.0) Gecko/20100101 Firefox/93.0"})
resp.status_code
403
print(resp.text)
<!DOCTYPE html>
<html lang="en-us"><head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta charset="utf-8">
<title>Yahoo</title>
<meta name="viewport" content="width=device-width,initial-scale=1,minimal-ui">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<style>
html {
height: 100%;
}
body {
background: #fafafc url(https://s.yimg.com/nn/img/sad-panda-201402200631.png) 50% 50%;
background-size: cover;
height: 100%;
text-align: center;
font: 300 18px "helvetica neue", helvetica, verdana, tahoma, arial, sans-serif;
}
table {
height: 100%;
width: 100%;
table-layout: fixed;
border-collapse: collapse;
border-spacing: 0;
border: none;
}
h1 {
font-size: 42px;
font-weight: 400;
color: #400090;
}
p {
color: #1A1A1A;
}
#message-1 {
font-weight: bold;
margin: 0;
}
#message-2 {
display: inline-block;
*display: inline;
zoom: 1;
max-width: 17em;
_width: 17em;
}
</style>
<script>
document.write('<img src="//geo.yahoo.com/b?s=1197757129&t='+new Date().getTime()+'&src=aws&err_url='+encodeURIComponent(document.URL)+'&err=%<pssc>&test='+encodeURIComponent('%<{Bucket}cqh[:200]>')+'" width="0px" height="0px"/>');var beacon = new Image();beacon.src="//bcn.fp.yahoo.com/p?s=1197757129&t="+new Date().getTime()+"&src=aws&err_url="+encodeURIComponent(document.URL)+"&err=%<pssc>&test="+encodeURIComponent('%<{Bucket}cqh[:200]>');
</script>
</head>
<body>
<!-- status code : 404 -->
<!-- Not Found on Server -->
<table>
<tbody><tr>
<td>
<img src="https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_p_205x58_frontpage.png" alt="Yahoo Logo">
<h1 style="margin-top:20px;">Will be right back...</h1>
<p id="message-1">Thank you for your patience.</p>
<p id="message-2">Our engineers are working quickly to resolve the issue.</p>
</td>
</tr>
</tbody></table>
</body></html>
baseurl = "https://yfapi.net"
quoteurl = "/v6/finance/quote"
url = baseurl + quoteurl
params = {
"region":"US",
"lang":"en",
"symbols":"AAPL"
}
resp = requests.get(url, params=params)
resp.status_code
403
import requests
url = "https://rest.yahoofinanceapi.com/v6/finance/quote"
querystring = {"symbols":"AAPL,BTC-USD,EURUSD=X"}
headers = {
'x-api-key': "YOUR-PERSONAL-API-KEY"
}
response = requests.get(url, headers=headers, params=querystring)
print(response.text)
jsondata = requests.get("https://httpbin.org/get?key1=value1&key2=value2&myparam=myvalue").json()
jsondata['headers']
{'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Host': 'httpbin.org',
'User-Agent': 'python-requests/2.24.0',
'X-Amzn-Trace-Id': 'Root=1-617244f3-5293488d65b7036a10e1bce5'}
%%file yahoo_fianace.py
from selenium import webdriver
from typer
app = typer.Typer()
def get_url(ticker):
return "https://finance.yahoo.com/quote/{0}/history?p={0}".format(ticker)
@app.command()
def download(ticker="AAPL")
fp = webdriver.FirefoxProfile()
xls_mimetype = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
fp.set_preference("browser.helperApps.neverAsk.saveToDisk", xls_mimetype)
driver = webdriver.Firefox(firefox_profile=fp)
driver.implicitly_wait(5)
url = get_url(ticker)
driver.get(url)
download = driver.find_element_by_link_text("Download")
download.click()
driver.close()
if __name__ == "__main__":
app()
Writing yahoo_fianace.py
"this is string formating {var1}, {var2}".format(var1=2, var2=3)
'this is string formating 2, 3'
x = 90
y = 100
f"this is {x} and this is {y}"
'this is 90 and this is 100'
import sqlite3
conn = sqlite3.connect("data1.db") # this will vary depending on what database you are trying to connect with
cur = conn.cursor()
cur.execute("create table person (name varchar(100), email varchar(100))")
conn.commit()
def insert_person(name, email, conn):
cur = conn.cursor()
cur.execute(f"insert into person (name, email) values ('{name}', '{email}')")
conn.commit() # make sure you commit when you are doing write or change operation.
insert_person("alice", "aice@wonder.land", conn)
insert_person("alex", "alex@newyork.zoo", conn)
def find_person(name, conn):
query = f"select * from person where name='{name}'"
print(query)
cur = conn.cursor()
results = cur.execute(query)
return results
r= find_person("alex", conn) # iterator ... so you can use it only once
select * from person where name='alex'
r.fetchall() # consumed ..
[('alex', 'alex@newyork.zoo')]
r.fetchone() # already consumed, will not get any result
r = find_person("alice", conn)
select * from person where name='alice'
r.fetchone() # it get first item from the results..being just one item it is tuple
('alice', 'aice@wonder.land')
r = find_person("alice", conn)
r.fetchall() # it returns list of tuples... every tuple is one row from database
select * from person where name='alice'
[('alice', 'aice@wonder.land')]
(1, 2, 3)
(1, 2, 3)
(1,) # a tuple with single item in it
(1,)
def find_person_(name, conn):
q = "select * from person where name=?"
cur = conn.cursor()
result = cur.execute(q, (name, )) # this is right way of doing
return result.fetchall()
find_person_("alice", conn)
[('alice', 'aice@wonder.land')]
find_person("alice", conn).fetchall()
select * from person where name='alice'
[('alice', 'aice@wonder.land')]
find_person_("alice", conn)
[('alice', 'aice@wonder.land')]
conn.close()
import pandas as pd
conn = sqlite3.connect("data1")
wallet = pd.read_csv("wallet.csv", index_col=0)
wallet
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... |
| 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 4 columns
wallet.to_sql("wallet", con=conn, if_exists="append")
r= conn.cursor().execute("select * from wallet")
for item in r:
print(item)
(0, '2021-03-07 14:53:28.377359', 'Music', 'Amazon', 421.2073272347991) (1, '2020-10-08 09:53:28.377359', 'Food', 'Swiggy', 328.4400802428426) (2, '2021-02-23 09:53:28.377359', 'Books', 'Amazon', 244.67943701511356) (3, '2020-11-01 14:53:28.377359', 'Utility', 'Phone', 222.75631758052768) (4, '2021-06-05 13:53:28.377359', 'Books', 'Flipcart', 494.1284923793595) (5, '2021-07-28 19:53:28.377359', 'Utility', 'Electricity', 219.9417113096841) (6, '2021-04-16 11:53:28.377359', 'Books', 'Amazon Kindle', 270.32259514795845) (7, '2021-02-15 10:53:28.377359', 'Food', 'Zomato', 457.1831036346536) (8, '2021-08-10 19:53:28.377359', 'Utility', 'Phone', 151.4963725994779) (9, '2020-11-29 14:53:28.377359', 'Travel', 'Auto', 443.61888423247854) (10, '2021-06-15 13:53:28.377359', 'Travel', 'Metro', 328.1754210974373) (11, '2021-07-24 13:53:28.377359', 'Food', 'Zomato', 434.4954675355444) (12, '2021-07-24 14:53:28.377359', 'Music', 'Amazon', 329.5360031897569) (13, '2021-06-06 10:53:28.377359', 'Utility', 'Phone', 154.04494918166588) (14, '2021-06-09 13:53:28.377359', 'Travel', 'Taxi', 485.2977429821982) (15, '2021-08-24 17:53:28.377359', 'Food', 'Zomato', 262.9439932340398) (16, '2021-03-05 19:53:28.377359', 'Utility', 'Phone', 390.31687619327926) (17, '2021-04-17 18:53:28.377359', 'Utility', 'Electricity', 316.87867542466364) (18, '2021-05-08 15:53:28.377359', 'Travel', 'Auto', 433.8224042777937) (19, '2021-05-16 10:53:28.377359', 'Books', 'Flipcart', 109.32590886550068) (20, '2020-10-12 18:53:28.377359', 'Travel', 'Auto', 365.9218082537661) (21, '2021-01-04 19:53:28.377359', 'Travel', 'Metro', 329.0973715025852) (22, '2021-06-24 15:53:28.377359', 'Food', 'Zomato', 489.14348305222535) (23, '2020-12-11 10:53:28.377359', 'Music', 'Netflix', 354.9402409919817) (24, '2021-05-31 11:53:28.377359', 'Books', 'Amazon', 498.1004955046106) (25, '2021-05-21 14:53:28.377359', 'Food', 'Hotel', 483.315863517772) (26, '2020-08-26 15:53:28.377359', 'Books', 'Amazon Kindle', 138.806577801854) (27, '2021-05-01 15:53:28.377359', 'Utility', 'Electricity', 103.68079074846584) (28, '2020-12-14 15:53:28.377359', 'Utility', 'Phone', 358.4599327957656) (29, '2021-06-20 10:53:28.377359', 'Utility', 'Electricity', 184.5577284049955) (30, '2020-09-15 18:53:28.377359', 'Food', 'Swiggy', 203.52923978943267) (31, '2020-09-25 11:53:28.377359', 'Books', 'Flipcart', 246.503527384528) (32, '2021-06-23 11:53:28.377359', 'Food', 'Zomato', 345.0304360814152) (33, '2021-05-14 18:53:28.377359', 'Food', 'Hotel', 449.24802955761743) (34, '2021-05-14 10:53:28.377359', 'Utility', 'Phone', 499.8581815222449) (35, '2021-02-18 18:53:28.377359', 'Travel', 'Metro', 441.6021430011205) (36, '2020-12-10 10:53:28.377359', 'Travel', 'Auto', 472.94143917262176) (37, '2021-04-18 16:53:28.377359', 'Music', 'Amazon', 266.0690783774673) (38, '2021-08-15 10:53:28.377359', 'Travel', 'Auto', 494.12439940565713) (39, '2021-05-17 17:53:28.377359', 'Food', 'Swiggy', 112.33316019807457) (40, '2021-07-19 12:53:28.377359', 'Food', 'Swiggy', 291.54598801930536) (41, '2021-02-20 19:53:28.377359', 'Utility', 'Phone', 425.18719068071806) (42, '2021-08-22 17:53:28.377359', 'Food', 'Hotel', 210.25626950078566) (43, '2020-09-21 12:53:28.377359', 'Utility', 'Phone', 486.0339327616073) (44, '2020-12-26 19:53:28.377359', 'Utility', 'Electricity', 257.92759337085425) (45, '2021-05-27 16:53:28.377359', 'Utility', 'Electricity', 154.74287259516657) (46, '2021-05-15 15:53:28.377359', 'Utility', 'Electricity', 359.3249716537848) (47, '2020-10-28 10:53:28.377359', 'Books', 'Flipcart', 310.408610004679) (48, '2021-08-23 17:53:28.377359', 'Utility', 'Electricity', 310.05840961423314) (49, '2021-03-16 09:53:28.377359', 'Music', 'spotify', 232.30340219121135) (50, '2020-12-24 11:53:28.377359', 'Food', 'Zomato', 463.0018749263554) (51, '2020-12-22 17:53:28.377359', 'Food', 'Zomato', 331.22702332837093) (52, '2021-03-26 09:53:28.377359', 'Travel', 'Taxi', 403.61007013419334) (53, '2021-01-27 09:53:28.377359', 'Utility', 'Electricity', 183.1866624101276) (54, '2020-11-16 10:53:28.377359', 'Music', 'spotify', 160.81754340768396) (55, '2021-01-21 19:53:28.377359', 'Books', 'Flipcart', 423.74970808720553) (56, '2021-05-19 18:53:28.377359', 'Utility', 'Phone', 319.34287626846185) (57, '2021-07-15 15:53:28.377359', 'Utility', 'Phone', 279.6090437716363) (58, '2021-05-20 10:53:28.377359', 'Food', 'Hotel', 255.8710346734312) (59, '2020-08-28 11:53:28.377359', 'Food', 'Swiggy', 208.2329120852039) (60, '2021-01-17 11:53:28.377359', 'Utility', 'Electricity', 382.5195101154448) (61, '2021-02-25 13:53:28.377359', 'Food', 'Hotel', 124.65827844174062) (62, '2021-01-27 19:53:28.377359', 'Books', 'Amazon Kindle', 497.7708601564023) (63, '2021-05-10 11:53:28.377359', 'Travel', 'Taxi', 355.9890502253258) (64, '2021-01-31 14:53:28.377359', 'Food', 'Zomato', 232.2223798622789) (65, '2020-10-23 18:53:28.377359', 'Music', 'Netflix', 188.7487426895118) (66, '2020-10-09 16:53:28.377359', 'Food', 'Swiggy', 263.9577700340145) (67, '2021-07-31 14:53:28.377359', 'Music', 'Netflix', 324.786916846731) (68, '2020-08-26 09:53:28.377359', 'Travel', 'Taxi', 279.1478844739421) (69, '2020-10-10 15:53:28.377359', 'Utility', 'Electricity', 300.5246204193512) (70, '2021-08-17 13:53:28.377359', 'Utility', 'Phone', 125.22977317126335) (71, '2021-03-30 12:53:28.377359', 'Food', 'Swiggy', 245.36050838040904) (72, '2021-06-30 18:53:28.377359', 'Books', 'Amazon', 294.6628689900488) (73, '2021-08-15 17:53:28.377359', 'Travel', 'Metro', 117.58872931045572) (74, '2021-03-20 11:53:28.377359', 'Travel', 'Taxi', 303.0554209852045) (75, '2021-03-03 12:53:28.377359', 'Food', 'Hotel', 425.62529099481486) (76, '2020-11-17 09:53:28.377359', 'Music', 'Netflix', 197.5346000167895) (77, '2021-01-18 14:53:28.377359', 'Books', 'Amazon Kindle', 482.1523430204321) (78, '2020-09-09 16:53:28.377359', 'Music', 'spotify', 415.3728938035302) (79, '2021-08-17 09:53:28.377359', 'Music', 'Netflix', 321.7634156544651) (80, '2021-02-17 09:53:28.377359', 'Food', 'Swiggy', 283.09570727160764) (81, '2020-10-29 16:53:28.377359', 'Food', 'Hotel', 470.0809953992362) (82, '2020-09-22 09:53:28.377359', 'Music', 'spotify', 411.14270120842224) (83, '2021-03-18 09:53:28.377359', 'Books', 'Flipcart', 451.5844070294999) (84, '2020-09-21 10:53:28.377359', 'Music', 'Netflix', 158.7936457269333) (85, '2021-01-12 09:53:28.377359', 'Music', 'Amazon', 130.37490757527002) (86, '2021-05-07 16:53:28.377359', 'Food', 'Zomato', 198.45067179263802) (87, '2021-05-19 15:53:28.377359', 'Food', 'Zomato', 378.82064134052473) (88, '2021-04-18 09:53:28.377359', 'Utility', 'Phone', 124.22124784445779) (89, '2021-04-12 14:53:28.377359', 'Music', 'Amazon', 218.487173429263) (90, '2020-12-01 14:53:28.377359', 'Music', 'Amazon', 101.57327588889416) (91, '2021-01-22 17:53:28.377359', 'Food', 'Hotel', 232.66346838787223) (92, '2021-01-12 19:53:28.377359', 'Travel', 'Taxi', 356.84263798863265) (93, '2021-01-11 09:53:28.377359', 'Utility', 'Electricity', 111.72080867898062) (94, '2021-01-04 13:53:28.377359', 'Utility', 'Phone', 431.1855366816298) (95, '2021-07-19 13:53:28.377359', 'Utility', 'Phone', 388.67121323884214) (96, '2021-01-12 19:53:28.377359', 'Books', 'Flipcart', 467.5545618966052) (97, '2021-03-25 11:53:28.377359', 'Utility', 'Phone', 320.78943360123816) (98, '2021-05-13 15:53:28.377359', 'Travel', 'Taxi', 442.0964693975505) (99, '2020-10-11 16:53:28.377359', 'Food', 'Hotel', 100.45550129902664)
df = pd.read_sql_query("select * from wallet" , con=conn)
df
| index | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... | ... |
| 95 | 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 5 columns
import PyPDF2
with open("/home/vikrant/Downloads/upload.pdf", "rb") as f: # make note of binary mode (b)
pdfreader = PyPDF2.PdfFileReader(f)
n = pdfreader.getNumPages()
for p in range(n):
page = pdfreader.getPage(p)
print(page.extractText()[:50])
National Load Despatch Centre POWER SYSTEM O NR WR SR ER NER TOTAL 59882 41115 34238 21526 2730 16-Jul-2020 Sl No Voltage Level Line Details Circ
with open("/home/vikrant/Downloads/upload.pdf", "rb") as f: # make note of binary mode (b)
pdfreader = PyPDF2.PdfFileReader(f)
n = pdfreader.getNumPages()
page = pdfreader.getPage(1)
print(page.extractText()[:300])
NR WR SR ER NER TOTAL 59882 41115 34238 21526 2730 159491 1114 0 0 0 6 1120 1398 998 807 447 48 3698 355 33 77 149 29 643 11 49 128 - - 187 39.60 16.60 41.59 4.60 0.03 102 12.6 0.0 0.0 0.0 0.0 12.6 65470 43593 38117 21535 2827 160654 22:20 10:29 10:00 21:20 19:41 21:26 Region FVI < 49.7 49.7 - 49.8
import pandas as pd
def get_rows(lines, n, numrow):
s = 0
rows = []
for i in range(numrow):
rows.append(lines[s:s+n])
s = s+n
return rows
def column(data2d, n):
return [row[n] for row in data2d]
def transpose(data2d):
numcols = len(data2d[0])
return [column(data2d, i) for i in range(numcols)]
def extract_tableA(page):
lines = page.split("\n")
header = "NR WR SR ER NER TOTAL".split()
rowdata = get_rows(lines[len(header):], len(header), 9) # we counted manually that there 9 rows
columndata = transpose(rowdata)
data = dict(zip(header, columndata))
return pd.DataFrame(data)
extract_tableA(pdfreader.getPage(1).extractText())
| NR | WR | SR | ER | NER | TOTAL | |
|---|---|---|---|---|---|---|
| 0 | 59882 | 41115 | 34238 | 21526 | 2730 | 159491 |
| 1 | 1114 | 0 | 0 | 0 | 6 | 1120 |
| 2 | 1398 | 998 | 807 | 447 | 48 | 3698 |
| 3 | 355 | 33 | 77 | 149 | 29 | 643 |
| 4 | 11 | 49 | 128 | - | - | 187 |
| 5 | 39.60 | 16.60 | 41.59 | 4.60 | 0.03 | 102 |
| 6 | 12.6 | 0.0 | 0.0 | 0.0 | 0.0 | 12.6 |
| 7 | 65470 | 43593 | 38117 | 21535 | 2827 | 160654 |
| 8 | 22:20 | 10:29 | 10:00 | 21:20 | 19:41 | 21:26 |
tables = [[i*j for i in range(1, 11)] for j in range(1, 6)]
tables
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], [3, 6, 9, 12, 15, 18, 21, 24, 27, 30], [4, 8, 12, 16, 20, 24, 28, 32, 36, 40], [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]]
tables[2]
[3, 6, 9, 12, 15, 18, 21, 24, 27, 30]
tables[3]
[4, 8, 12, 16, 20, 24, 28, 32, 36, 40]
tables[0][2]
3
tables[1][2]
6
tables[2][2]
9
[row[2] for row in tables]
[3, 6, 9, 12, 15]
def column(data2d, n):
return [row[n] for row in data2d]
[column(tables, i) for i in range(10)]
[[1, 2, 3, 4, 5], [2, 4, 6, 8, 10], [3, 6, 9, 12, 15], [4, 8, 12, 16, 20], [5, 10, 15, 20, 25], [6, 12, 18, 24, 30], [7, 14, 21, 28, 35], [8, 16, 24, 32, 40], [9, 18, 27, 36, 45], [10, 20, 30, 40, 50]]
tables
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], [3, 6, 9, 12, 15, 18, 21, 24, 27, 30], [4, 8, 12, 16, 20, 24, 28, 32, 36, 40], [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]]