Oct 11-15, 2021 Vikrant Patil
These notes are available online at https://notes.pipal.in/2021/arcesium_finop_batch2/
© Pipal Academy LLP
Day 1 | Day 2 | Day 3 | Day 4 | Day 5
We will be using jupyter hub from https://lab2.pipal.in for this training.
create a notebook with name module3-day3
Http protocols 4 methods
Webapi will usually have documentation about whetehr the url is got get/post/put/delete
import requests
API_KEY = "UKVFE0JLE0TBPDEF"
alphavantageurl = "https://www.alphavantage.co/query"
params = {
"function":"TIME_SERIES_INTRADAY",
"symbol":"AAPL",
"interval":"15min",
"apikey": API_KEY
}
resp = requests.get(alphavantageurl, params=params)
resp.status_code
200
data = resp.json()
type(data)
dict
data.keys()
dict_keys(['Meta Data', 'Time Series (15min)'])
data['Meta Data']
{'1. Information': 'Intraday (15min) open, high, low, close prices and volume',
'2. Symbol': 'AAPL',
'3. Last Refreshed': '2021-10-27 20:00:00',
'4. Interval': '15min',
'5. Output Size': 'Compact',
'6. Time Zone': 'US/Eastern'}
type(data['Time Series (15min)'])
dict
len(data['Time Series (15min)'])
100
data['Time Series (15min)'].keys()
dict_keys(['2021-10-27 20:00:00', '2021-10-27 19:45:00', '2021-10-27 19:30:00', '2021-10-27 19:15:00', '2021-10-27 19:00:00', '2021-10-27 18:45:00', '2021-10-27 18:30:00', '2021-10-27 18:15:00', '2021-10-27 18:00:00', '2021-10-27 17:45:00', '2021-10-27 17:30:00', '2021-10-27 17:15:00', '2021-10-27 17:00:00', '2021-10-27 16:45:00', '2021-10-27 16:30:00', '2021-10-27 16:15:00', '2021-10-27 16:00:00', '2021-10-27 15:45:00', '2021-10-27 15:30:00', '2021-10-27 15:15:00', '2021-10-27 15:00:00', '2021-10-27 14:45:00', '2021-10-27 14:30:00', '2021-10-27 14:15:00', '2021-10-27 14:00:00', '2021-10-27 13:45:00', '2021-10-27 13:30:00', '2021-10-27 13:15:00', '2021-10-27 13:00:00', '2021-10-27 12:45:00', '2021-10-27 12:30:00', '2021-10-27 12:15:00', '2021-10-27 12:00:00', '2021-10-27 11:45:00', '2021-10-27 11:30:00', '2021-10-27 11:15:00', '2021-10-27 11:00:00', '2021-10-27 10:45:00', '2021-10-27 10:30:00', '2021-10-27 10:15:00', '2021-10-27 10:00:00', '2021-10-27 09:45:00', '2021-10-27 09:30:00', '2021-10-27 09:15:00', '2021-10-27 09:00:00', '2021-10-27 08:45:00', '2021-10-27 08:30:00', '2021-10-27 08:15:00', '2021-10-27 08:00:00', '2021-10-27 07:45:00', '2021-10-27 07:30:00', '2021-10-27 07:15:00', '2021-10-27 07:00:00', '2021-10-27 06:45:00', '2021-10-27 06:30:00', '2021-10-27 06:15:00', '2021-10-27 06:00:00', '2021-10-27 05:45:00', '2021-10-27 05:30:00', '2021-10-27 05:15:00', '2021-10-27 05:00:00', '2021-10-27 04:45:00', '2021-10-27 04:30:00', '2021-10-27 04:15:00', '2021-10-26 20:00:00', '2021-10-26 19:45:00', '2021-10-26 19:30:00', '2021-10-26 19:15:00', '2021-10-26 19:00:00', '2021-10-26 18:45:00', '2021-10-26 18:30:00', '2021-10-26 18:15:00', '2021-10-26 18:00:00', '2021-10-26 17:45:00', '2021-10-26 17:30:00', '2021-10-26 17:15:00', '2021-10-26 17:00:00', '2021-10-26 16:45:00', '2021-10-26 16:30:00', '2021-10-26 16:15:00', '2021-10-26 16:00:00', '2021-10-26 15:45:00', '2021-10-26 15:30:00', '2021-10-26 15:15:00', '2021-10-26 15:00:00', '2021-10-26 14:45:00', '2021-10-26 14:30:00', '2021-10-26 14:15:00', '2021-10-26 14:00:00', '2021-10-26 13:45:00', '2021-10-26 13:30:00', '2021-10-26 13:15:00', '2021-10-26 13:00:00', '2021-10-26 12:45:00', '2021-10-26 12:30:00', '2021-10-26 12:15:00', '2021-10-26 12:00:00', '2021-10-26 11:45:00', '2021-10-26 11:30:00', '2021-10-26 11:15:00'])
data['Time Series (15min)']['2021-10-27 20:00:00']
{'1. open': '149.1000',
'2. high': '149.1400',
'3. low': '149.0000',
'4. close': '149.0700',
'5. volume': '36823'}
import pandas as pd
pd.DataFrame(data['Time Series (15min)'])
| 2021-10-27 20:00:00 | 2021-10-27 19:45:00 | 2021-10-27 19:30:00 | 2021-10-27 19:15:00 | 2021-10-27 19:00:00 | 2021-10-27 18:45:00 | 2021-10-27 18:30:00 | 2021-10-27 18:15:00 | 2021-10-27 18:00:00 | 2021-10-27 17:45:00 | ... | 2021-10-26 13:30:00 | 2021-10-26 13:15:00 | 2021-10-26 13:00:00 | 2021-10-26 12:45:00 | 2021-10-26 12:30:00 | 2021-10-26 12:15:00 | 2021-10-26 12:00:00 | 2021-10-26 11:45:00 | 2021-10-26 11:30:00 | 2021-10-26 11:15:00 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1. open | 149.1000 | 149.1500 | 149.1400 | 149.1700 | 149.1700 | 149.1000 | 149.1400 | 149.0900 | 148.9800 | 149.0000 | ... | 149.5300 | 149.4600 | 149.4900 | 149.6550 | 149.7200 | 149.6100 | 149.8600 | 149.9400 | 149.8500 | 149.3900 |
| 2. high | 149.1400 | 149.1699 | 149.1700 | 149.1700 | 149.1700 | 149.1400 | 149.1700 | 149.1501 | 149.1100 | 149.0000 | ... | 149.6450 | 149.5874 | 149.5300 | 149.7450 | 149.8590 | 149.7500 | 149.9100 | 149.9800 | 150.0499 | 149.9700 |
| 3. low | 149.0000 | 149.0200 | 149.0300 | 149.1100 | 149.1200 | 149.0900 | 149.0700 | 149.0900 | 148.9700 | 148.9300 | ... | 149.3800 | 149.2900 | 149.2000 | 149.3330 | 149.6300 | 149.3750 | 149.4601 | 149.4300 | 149.8300 | 149.3800 |
| 4. close | 149.0700 | 149.1400 | 149.1700 | 149.1400 | 149.1400 | 149.1400 | 149.1000 | 149.1400 | 149.0800 | 148.9400 | ... | 149.5800 | 149.5200 | 149.4604 | 149.4955 | 149.6562 | 149.7150 | 149.6050 | 149.8597 | 149.9400 | 149.8478 |
| 5. volume | 36823 | 29874 | 21690 | 14188 | 13809 | 10752 | 13468 | 26187 | 19196 | 9171 | ... | 1348407 | 1206561 | 1791171 | 1384721 | 1359349 | 2046436 | 2116729 | 2233785 | 1874156 | 2101928 |
5 rows × 100 columns
timeseries = pd.DataFrame(data['Time Series (15min)']).transpose()
timeseries
| 1. open | 2. high | 3. low | 4. close | 5. volume | |
|---|---|---|---|---|---|
| 2021-10-27 20:00:00 | 149.1000 | 149.1400 | 149.0000 | 149.0700 | 36823 |
| 2021-10-27 19:45:00 | 149.1500 | 149.1699 | 149.0200 | 149.1400 | 29874 |
| 2021-10-27 19:30:00 | 149.1400 | 149.1700 | 149.0300 | 149.1700 | 21690 |
| 2021-10-27 19:15:00 | 149.1700 | 149.1700 | 149.1100 | 149.1400 | 14188 |
| 2021-10-27 19:00:00 | 149.1700 | 149.1700 | 149.1200 | 149.1400 | 13809 |
| ... | ... | ... | ... | ... | ... |
| 2021-10-26 12:15:00 | 149.6100 | 149.7500 | 149.3750 | 149.7150 | 2046436 |
| 2021-10-26 12:00:00 | 149.8600 | 149.9100 | 149.4601 | 149.6050 | 2116729 |
| 2021-10-26 11:45:00 | 149.9400 | 149.9800 | 149.4300 | 149.8597 | 2233785 |
| 2021-10-26 11:30:00 | 149.8500 | 150.0499 | 149.8300 | 149.9400 | 1874156 |
| 2021-10-26 11:15:00 | 149.3900 | 149.9700 | 149.3800 | 149.8478 | 2101928 |
100 rows × 5 columns
API_KEY = "UKVFE0JLE0TBPDEF"
alphavantageurl = "https://www.alphavantage.co/query"
params = {
"function":"TIME_SERIES_INTRADAY",
"symbol":"AAPL",
"interval":"15min",
"apikey": API_KEY,
"datatype": "csv"
}
resp = requests.get(alphavantageurl, params=params)
with open("timeseries.csv", "w") as f:
f.write(resp.text)
df = pd.read_csv("timeseries.csv")
df
| timestamp | open | high | low | close | volume | |
|---|---|---|---|---|---|---|
| 0 | 2021-10-27 20:00:00 | 149.10 | 149.1400 | 149.0000 | 149.0700 | 36823 |
| 1 | 2021-10-27 19:45:00 | 149.15 | 149.1699 | 149.0200 | 149.1400 | 29874 |
| 2 | 2021-10-27 19:30:00 | 149.14 | 149.1700 | 149.0300 | 149.1700 | 21690 |
| 3 | 2021-10-27 19:15:00 | 149.17 | 149.1700 | 149.1100 | 149.1400 | 14188 |
| 4 | 2021-10-27 19:00:00 | 149.17 | 149.1700 | 149.1200 | 149.1400 | 13809 |
| ... | ... | ... | ... | ... | ... | ... |
| 95 | 2021-10-26 12:15:00 | 149.61 | 149.7500 | 149.3750 | 149.7150 | 2046436 |
| 96 | 2021-10-26 12:00:00 | 149.86 | 149.9100 | 149.4601 | 149.6050 | 2116729 |
| 97 | 2021-10-26 11:45:00 | 149.94 | 149.9800 | 149.4300 | 149.8597 | 2233785 |
| 98 | 2021-10-26 11:30:00 | 149.85 | 150.0499 | 149.8300 | 149.9400 | 1874156 |
| 99 | 2021-10-26 11:15:00 | 149.39 | 149.9700 | 149.3800 | 149.8478 | 2101928 |
100 rows × 6 columns
df.columns
Index(['timestamp', 'open', 'high', 'low', 'close', 'volume'], dtype='object')
df.timestamp
0 2021-10-27 20:00:00
1 2021-10-27 19:45:00
2 2021-10-27 19:30:00
3 2021-10-27 19:15:00
4 2021-10-27 19:00:00
...
95 2021-10-26 12:15:00
96 2021-10-26 12:00:00
97 2021-10-26 11:45:00
98 2021-10-26 11:30:00
99 2021-10-26 11:15:00
Name: timestamp, Length: 100, dtype: object
df['timestamp'] = pd.to_datetime(df['timestamp'])
df
| timestamp | open | high | low | close | volume | |
|---|---|---|---|---|---|---|
| 0 | 2021-10-27 20:00:00 | 149.10 | 149.1400 | 149.0000 | 149.0700 | 36823 |
| 1 | 2021-10-27 19:45:00 | 149.15 | 149.1699 | 149.0200 | 149.1400 | 29874 |
| 2 | 2021-10-27 19:30:00 | 149.14 | 149.1700 | 149.0300 | 149.1700 | 21690 |
| 3 | 2021-10-27 19:15:00 | 149.17 | 149.1700 | 149.1100 | 149.1400 | 14188 |
| 4 | 2021-10-27 19:00:00 | 149.17 | 149.1700 | 149.1200 | 149.1400 | 13809 |
| ... | ... | ... | ... | ... | ... | ... |
| 95 | 2021-10-26 12:15:00 | 149.61 | 149.7500 | 149.3750 | 149.7150 | 2046436 |
| 96 | 2021-10-26 12:00:00 | 149.86 | 149.9100 | 149.4601 | 149.6050 | 2116729 |
| 97 | 2021-10-26 11:45:00 | 149.94 | 149.9800 | 149.4300 | 149.8597 | 2233785 |
| 98 | 2021-10-26 11:30:00 | 149.85 | 150.0499 | 149.8300 | 149.9400 | 1874156 |
| 99 | 2021-10-26 11:15:00 | 149.39 | 149.9700 | 149.3800 | 149.8478 | 2101928 |
100 rows × 6 columns
df.info() # info method tells about basic information about columns
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 timestamp 100 non-null datetime64[ns] 1 open 100 non-null float64 2 high 100 non-null float64 3 low 100 non-null float64 4 close 100 non-null float64 5 volume 100 non-null int64 dtypes: datetime64[ns](1), float64(4), int64(1) memory usage: 4.8 KB
resp = requests.post("https://httpbin.org/post", data={"input1":"x", "input2":"y"})
resp.status_code
200
resp.json()
{'args': {},
'data': '',
'files': {},
'form': {'input1': 'x', 'input2': 'y'},
'headers': {'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Content-Length': '17',
'Content-Type': 'application/x-www-form-urlencoded',
'Host': 'httpbin.org',
'User-Agent': 'python-requests/2.24.0',
'X-Amzn-Trace-Id': 'Root=1-617a3601-24633c5a5cd717666f560652'},
'json': None,
'origin': '157.33.57.222',
'url': 'https://httpbin.org/post'}
user = "vikipedia"
pass_ = open("/tmp/pass.txt").read().strip()
resp = requests.get("http://api.github.com/user", auth=(user, pass_)) # simple user/password authentication
resp.status_code
401
resp.text
'{"message":"Requires authentication","documentation_url":"https://docs.github.com/rest/reference/users#get-the-authenticated-user"}'
There are different kinds of authentications, kerberos is one of those
pip install requests requests-kerberos
kerberos_auth=HTTPKerberosAuth(mutual_authentication=OPTIONAL)
response=requests.get(request_url, auth=kerberos_auth, params=params).json()
url = "http://www.thehindu.com"
respose = requests.get(url, params={"service":"rss"})
xmltext = respose.text
print(xmltext[:1500])
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>The Hindu - Home</title>
<link>https://www.thehindu.com/</link>
<description>Default RSS Feed</description>
<language>en-us</language>
<copyright>Copyright 2021 The Hindu</copyright>
<item>
<title><![CDATA[5 subs in football matches to be allowed on a permanent basis]]></title>
<author><![CDATA[AP]]></author>
<category><![CDATA[Football]]></category>
<link>https://www.thehindu.com/sport/football/5-subs-in-football-matches-to-be-allowed-on-a-permanent-basis/article37205294.ece</link>
<description><![CDATA[
The measure was introduced in May 2020 so teams could cope better with the congested schedule when football resumed following a suspension due to the coronavirus
]]></description>
<pubDate><![CDATA[Thu, 28 Oct 2021 10:53:19 +0530]]></pubDate>
</item>
<item>
<title><![CDATA[Facebook asks employees to preserve internal documents for legal inquiries]]></title>
<author><![CDATA[Reuters]]></author>
<category><![CDATA[Internet]]></category>
<link>https://www.thehindu.com/sci-tech/technology/internet/facebook-asks-employees-to-preserve-internal-documents-for-legal-inquiries/article37205072.ece</link>
<description><![CDATA[
The increased scrutin
<tag1>
<tag2>
<item>
<leaf>
text
</leaf>
</item>
</tag2>
</tag1>
from xml.etree import ElementTree as et
root = et.fromstring(xmltext)
items = root.findall(".//item")
items[0]
<Element 'item' at 0x7f3b44ecc3b0>
print(et.tostring(items[0]).decode())
<item>
<title>5 subs in football matches to be allowed on a permanent basis</title>
<author>AP</author>
<category>Football</category>
<link>https://www.thehindu.com/sport/football/5-subs-in-football-matches-to-be-allowed-on-a-permanent-basis/article37205294.ece</link>
<description>
The measure was introduced in May 2020 so teams could cope better with the congested schedule when football resumed following a suspension due to the coronavirus
</description>
<pubDate>Thu, 28 Oct 2021 10:53:19 +0530</pubDate>
</item>
for item in items[:10]:
print(item.findtext("title"))
print(item.findtext("author"))
print(item.findtext("link"))
print("="*30)
5 subs in football matches to be allowed on a permanent basis AP https://www.thehindu.com/sport/football/5-subs-in-football-matches-to-be-allowed-on-a-permanent-basis/article37205294.ece ============================== Facebook asks employees to preserve internal documents for legal inquiries Reuters https://www.thehindu.com/sci-tech/technology/internet/facebook-asks-employees-to-preserve-internal-documents-for-legal-inquiries/article37205072.ece ============================== Intel teams with Google Cloud to develop new class of data centre chip Reuters https://www.thehindu.com/sci-tech/technology/intel-teams-with-google-cloud-to-develop-new-class-of-data-centre-chip/article37205117.ece ============================== Manipur rights panel to begin inquiry into death of social activist Iboyaima Laithangbam https://www.thehindu.com/news/national/other-states/manipur-rights-panel-to-begin-inquiry-into-death-of-social-activist/article37205188.ece ============================== LaLiga | Barcelona sacks Koeman; 4 teams tied in Spain after Madrid draws AP https://www.thehindu.com/sport/football/laliga-barcelona-sacks-koeman-4-teams-tied-in-spain-after-madrid-draws/article37205161.ece ============================== Business live | Nykaa to launch IPO today https://www.thehindu.com/business/business-news-live-update-oct-28-2021/article37205119.ece ============================== Home Minister, DGP interact with COVID victims’ families Staff Reporter https://www.thehindu.com/news/national/andhra-pradesh/home-minister-dgp-interact-with-covid-victims-families/article37205034.ece ============================== Amazon seen triumphing over Apple privacy changes in digital ad business Reuters https://www.thehindu.com/sci-tech/technology/internet/amazon-seen-triumphing-over-apple-privacy-changes-in-digital-ad-business/article37204836.ece ============================== Samsung says component shortages to affect some chip demand, profit hits 3-year high Reuters https://www.thehindu.com/sci-tech/technology/samsung-says-component-shortages-to-affect-some-chip-demand-profit-hits-3-year-high/article37204744.ece ============================== TDP leaders confined to homes ahead of protest at Rushikonda Special Correspondent https://www.thehindu.com/news/national/andhra-pradesh/tdp-leaders-confined-to-homes-ahead-of-protest-at-rushikonda/article37205015.ece ==============================
def download(url, filename):
resp = requests.get(url)
with open(filename, "wb") as f:
f.write(resp.content) # resp.content is always binary
url = "https://raw.githubusercontent.com/vikipedia/python-trainings/master/online_course/source/module2/wallet.csv"
download(url, "csvdata.csv")
!python3 head.py csvdata.csv
,date,category,description,debit 0,2021-03-07 14:53:28.377359,Music,Amazon,421.2073272347991 1,2020-10-08 09:53:28.377359,Food,Swiggy,328.4400802428426 2,2021-02-23 09:53:28.377359,Books,Amazon,244.67943701511354 3,2020-11-01 14:53:28.377359,Utility,Phone,222.7563175805277
excelurl = "https://raw.githubusercontent.com/vikipedia/python-trainings/master/online_course/source/module2/wallet.xlsx"
download(excelurl, "exceldata.xlsx")
!file exceldata.xlsx
exceldata.xlsx: Microsoft Excel 2007+
def download_big(url, filename, chunksize=1024):
resp = requests.get(url)
with open(filename, "wb") as f:
for chunk in resp.iter_content(chunk_size=chunksize):
f.write(chunk)
print(".",end="")
download_big(excelurl, "exceldata.xlsx")
...........
url = "https://www.nseindia.com/get-quotes/equity?symbol=RELIANCE"
headers = {'Accept': '*/*',
'Accept-Language':'en-US,en;q=0.5',
'Host': 'www.nseindia.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
'X-Requested-With':'XMLHttpRequest'
}
url = "https://www.nseindia.com/get-quotes/equity"
resp = requests.get(url, params={"symbol":"RELIANCE"},headers=headers)
resp.status_code
200
print(resp.text[:100])
<!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; ch
import bs4
soup = bs4.BeautifulSoup(resp.content)
soup.findAll(attrs={'id':''})[0].text
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) <ipython-input-85-b91a5b1718b7> in <module> ----> 1 soup.findAll(attrs={'id':'responseDiv'})[0].text IndexError: list index out of range
tables = pd.read_html(resp.text)
len(tables)
10
tables[0]
| Series | Symbol | ISIN | Status | Date of Listing | Industry | Symbol P/E | Sectoral Index P/E | Sectoral Index |
|---|
tables[2]
| Prev. Close | Open | Close* | VWAP | Lower Band | Upper Band | Price Band | Adjusted Price * |
|---|
tables[3]
| Qty | Bid (₹) | Ask (₹) | Qty.1 |
|---|
tables[4]
| Trade Information | Trade Information.1 | |
|---|---|---|
| 0 | Traded Volume (Shares) | - |
| 1 | Traded Value (₹ Lakhs) | - |
| 2 | Total Market Cap (₹ Lakhs) | - |
| 3 | Free Float Market Cap (₹ Lakhs) | - |
| 4 | Impact cost | - |
which you can try in your private virtual env
https://nsetools.readthedocs.io/en/latest/
pip install nsetools
from nsetool import Nse
nse = Nse()
nse.get_quote("RELIANCE")
nse.get_index_list()
nse.get_index_quote("NIFTY BANK")
nse.get_stock_codes()
nse.get_advances_declines()
we will make use of PyPDF2 library to read pdf files
!python3 -m pip install PyPDF2
import PyPDF2
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-93-69e9e6162b4c> in <module> ----> 1 import PyPDF2 ModuleNotFoundError: No module named 'PyPDF2'
!python3 -m pip install PyPDF2
Processing /home/vikrant/.cache/pip/wheels/b1/1a/8f/a4c34be976825a2f7948d0fa40907598d69834f8ab5889de11/PyPDF2-1.26.0-py3-none-any.whl Installing collected packages: PyPDF2 Successfully installed PyPDF2-1.26.0
import PyPDF2
we will make use of this pdf to write our sample code https://notes.pipal.in/2021/arcesium_finop_batch2/upload.pdf
download_big("https://notes.pipal.in/2021/arcesium_finop_batch2/upload.pdf", "upload.pdf")
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
with open("upload.pdf", "rb") as f:
pdfreader = PyPDF2.PdfFileReader(f)
n = pdfreader.getNumPages()
for p in range(n):
page = pdfreader.getPage(p)
print(page.extractText()[:50])
National Load Despatch Centre POWER SYSTEM O NR WR SR ER NER TOTAL 59882 41115 34238 21526 2730 16-Jul-2020 Sl No Voltage Level Line Details Circ
with open("upload.pdf", "rb") as f:
pdfreader = PyPDF2.PdfFileReader(f)
n = pdfreader.getNumPages()
page = pdfreader.getPage(1)
print(page.extractText()[:300])
NR WR SR ER NER TOTAL 59882 41115 34238 21526 2730 159491 1114 0 0 0 6 1120 1398 998 807 447 48 3698 355 33 77 149 29 643 11 49 128 - - 187 39.60 16.60 41.59 4.60 0.03 102 12.6 0.0 0.0 0.0 0.0 12.6 65470 43593 38117 21535 2827 160654 22:20 10:29 10:00 21:20 19:41 21:26 Region FVI < 49.7 49.7 - 49.8
def download_big(url, filename, chunksize=1024):
resp = requests.get(url)
with open(filename, "wb") as f:
for chunk in resp.iter_content(chunk_size=chunksize):
f.write(chunk)
print(".",end="")
def get_rows(lines, n, rowcount):
s = 0
rows = []
for i in range(rowcount):
rows.append(lines[s:s+n])
s = s+n
return rows
def extract_tableA(pagetext):
lines = pagetext.split("\n")
headers = """NR
WR
SR
ER
NER
TOTAL""".split("\n")
n = len(headers)
rowdata = get_rows(lines[n:], n, 9) # 9 is counted manually
return pd.DataFrame(rowdata, columns=headers)
page1 = pdfreader.getPage(1).extractText()
data = extract_tableA(page1)
data
| NR | WR | SR | ER | NER | TOTAL | |
|---|---|---|---|---|---|---|
| 0 | 59882 | 41115 | 34238 | 21526 | 2730 | 159491 |
| 1 | 1114 | 0 | 0 | 0 | 6 | 1120 |
| 2 | 1398 | 998 | 807 | 447 | 48 | 3698 |
| 3 | 355 | 33 | 77 | 149 | 29 | 643 |
| 4 | 11 | 49 | 128 | - | - | 187 |
| 5 | 39.60 | 16.60 | 41.59 | 4.60 | 0.03 | 102 |
| 6 | 12.6 | 0.0 | 0.0 | 0.0 | 0.0 | 12.6 |
| 7 | 65470 | 43593 | 38117 | 21535 | 2827 | 160654 |
| 8 | 22:20 | 10:29 | 10:00 | 21:20 | 19:41 | 21:26 |
def get_rows(lines, n, rowcount):
s = 0
rows = []
for i in range(rowcount):
rows.append(lines[s:s+n])
s = s+n
return rows
def extract_tableA(pagetext, numrows=9):
lines = pagetext.split("\n")
headers = """NR
WR
SR
ER
NER
TOTAL""".split("\n")
n = len(headers)
rowdata = get_rows(lines[n:], n, numrows) # 9 is counted manually
return pd.DataFrame(rowdata, columns=headers)
page1 = pdfreader.getPage(1).extractText()
extract_tableA(page1, 9)
| NR | WR | SR | ER | NER | TOTAL | |
|---|---|---|---|---|---|---|
| 0 | 59882 | 41115 | 34238 | 21526 | 2730 | 159491 |
| 1 | 1114 | 0 | 0 | 0 | 6 | 1120 |
| 2 | 1398 | 998 | 807 | 447 | 48 | 3698 |
| 3 | 355 | 33 | 77 | 149 | 29 | 643 |
| 4 | 11 | 49 | 128 | - | - | 187 |
| 5 | 39.60 | 16.60 | 41.59 | 4.60 | 0.03 | 102 |
| 6 | 12.6 | 0.0 | 0.0 | 0.0 | 0.0 | 12.6 |
| 7 | 65470 | 43593 | 38117 | 21535 | 2827 | 160654 |
| 8 | 22:20 | 10:29 | 10:00 | 21:20 | 19:41 | 21:26 |
lines = page1.split("\n")
"Total : 565656.54645 INR"
"TOTAL : 44343.432432 INR"
line = "this some line which has Total in it but no INR"
"Total" in line and "INR" in line
True
%%file extract_tableA.py
import PyPDF2
import sys
import pandas as pd
import typer
app = typer.Typer()
def get_rows(lines, n, rowcount):
s = 0
rows = []
for i in range(rowcount):
rows.append(lines[s:s+n])
s = s+n
return rows
def extract_tableA(pagetext):
lines = pagetext.split("\n")
headers = """NR
WR
SR
ER
NER
TOTAL""".split("\n")
n = len(headers)
rowdata = get_rows(lines[n:], n, 9) # 9 is counted manually
return pd.DataFrame(rowdata, columns=headers)
def get_pdfpreader(filename):
with open(filename, "rb") as f:
pdfreader = PyPDF2.PdfFileReader(f)
return pdfreader
def extract_tableA_(filename):
with open(filename, "rb") as f:
pdfreader = PyPDF2.PdfFileReader(f)
page1 = pdfreader.getPage(1).extractText()
data = extract_tableA(page1)
return data
@app.command()
def extract_to_csv(filename:str, outputfile:str="tableA.csv"):
data = extract_tableA_(filename)
data.to_csv(outputfile, index=False)
if __name__ == "__main__":
app()
Overwriting extract_tableA.py
!python3 extract_tableA.py --help
Usage: extract_tableA.py [OPTIONS] FILENAME
Arguments:
FILENAME [required]
Options:
--outputfile TEXT [default: tableA.csv]
--install-completion [bash|zsh|fish|powershell|pwsh]
Install completion for the specified shell.
--show-completion [bash|zsh|fish|powershell|pwsh]
Show completion for the specified shell, to
copy it or customize the installation.
--help Show this message and exit.
!python3 extract_tableA.py upload.pdf --outputfile A.csv
!cat A.csv
NR,WR,SR,ER,NER,TOTAL 59882,41115,34238,21526,2730,159491 1114,0,0,0,6,1120 1398,998,807,447,48,3698 355,33,77,149,29,643 11,49,128,-,-,187 39.60,16.60,41.59,4.60,0.03,102 12.6,0.0,0.0,0.0,0.0,12.6 65470,43593,38117,21535,2827,160654 22:20,10:29,10:00,21:20,19:41,21:26
import csv
help(csv.writer)
Help on built-in function writer in module _csv:
writer(...)
csv_writer = csv.writer(fileobj [, dialect='excel']
[optional keyword args])
for row in sequence:
csv_writer.writerow(row)
[or]
csv_writer = csv.writer(fileobj [, dialect='excel']
[optional keyword args])
csv_writer.writerows(rows)
The "fileobj" argument can be any object that supports the file API.