Python Virtual Training For Arcesium - Module III - Day 2¶

Dec 17-23, 2020 Vikrant Patil

These notes are available online at http://notes.pipal.in/2020/arcesium_finop_batch3/module3-day2.html

We will be using JupyterHub from http://lab.pipal.in for this training. Create a notebook named module3-day2.ipynb for today's session. Before you start, shut down all kernels except the one for today's notebook.

import pandas as pd

wallet = pd.read_csv("wallet.csv")

wallet

wallet[wallet.category=="Music"]

wallet[wallet.debit > 200]

wallet[wallet.category=="Music"][wallet.debit>300] # category -> Music and amount > 300

<ipython-input-7-72b0c702cf8b>:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  wallet[wallet.category=="Music"][wallet.debit>300] # category -> Music and amount > 300

music = wallet[wallet.category=="Music"]

music

music[music.debit>300]

wallet[(wallet.description=="Amazon") & (wallet.category=='Music')]

wallet[(wallet.description=="Netflix") & (wallet.category=='Music')]

wallet[(wallet.description=="Netflix") | (wallet.description=='Amazon')]

def total_expenditure(wallet, category):
    """Return the sum of the ``debit`` column for rows in one category.

    wallet   -- DataFrame with at least ``category`` and ``debit`` columns
    category -- value compared for equality against the ``category`` column

    When no row matches, the sum of an empty selection (zero) is returned.
    """
    in_category = wallet["category"] == category
    # Select the matching rows and the debit column in a single step.
    return wallet.loc[in_category, "debit"].sum()

total_expenditure(wallet, "Books")

4929.750393283798

total_expenditure(wallet, "Music")

4233.451868232711

total_expenditure(wallet, "Food")

8281.189172581233

wallet.category.unique()

array(['Music', 'Food', 'Books', 'Utility', 'Travel'], dtype=object)

for cat in wallet.category.unique():
    print("Expenditure for {} is".format(cat),total_expenditure(wallet, cat))

Expenditure for Music is 4233.451868232711
Expenditure for Food is 8281.189172581233
Expenditure for Books is 4929.750393283798
Expenditure for Utility is 7562.267232638567
Expenditure for Travel is 6052.931876440963

wallet.groupby("category").mean() # average for every numeric column grouped by category

wallet.groupby("category").sum()

wallet.groupby("category").max()

wallet.date

0     2021-03-07 14:53:28.377359
1     2020-10-08 09:53:28.377359
2     2021-02-23 09:53:28.377359
3     2020-11-01 14:53:28.377359
4     2021-06-05 13:53:28.377359
                 ...            
95    2021-07-19 13:53:28.377359
96    2021-01-12 19:53:28.377359
97    2021-03-25 11:53:28.377359
98    2021-05-13 15:53:28.377359
99    2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: object

import datetime

today = datetime.datetime.now()

today

datetime.datetime(2021, 1, 12, 11, 22, 43, 36486)

tommorrow = today + datetime.timedelta(days=1)

tommorrow

datetime.datetime(2021, 1, 13, 11, 22, 43, 36486)

tommorrow > today

True

today > tommorrow

False

pd.to_datetime(wallet.date)

0    2021-03-07 14:53:28.377359
1    2020-10-08 09:53:28.377359
2    2021-02-23 09:53:28.377359
3    2020-11-01 14:53:28.377359
4    2021-06-05 13:53:28.377359
                ...            
95   2021-07-19 13:53:28.377359
96   2021-01-12 19:53:28.377359
97   2021-03-25 11:53:28.377359
98   2021-05-13 15:53:28.377359
99   2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: datetime64[ns]

wallet.date

0     2021-03-07 14:53:28.377359
1     2020-10-08 09:53:28.377359
2     2021-02-23 09:53:28.377359
3     2020-11-01 14:53:28.377359
4     2021-06-05 13:53:28.377359
                 ...            
95    2021-07-19 13:53:28.377359
96    2021-01-12 19:53:28.377359
97    2021-03-25 11:53:28.377359
98    2021-05-13 15:53:28.377359
99    2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: object

wallet['date'] = pd.to_datetime(wallet.date)

wallet.date

0    2021-03-07 14:53:28.377359
1    2020-10-08 09:53:28.377359
2    2021-02-23 09:53:28.377359
3    2020-11-01 14:53:28.377359
4    2021-06-05 13:53:28.377359
                ...            
95   2021-07-19 13:53:28.377359
96   2021-01-12 19:53:28.377359
97   2021-03-25 11:53:28.377359
98   2021-05-13 15:53:28.377359
99   2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: datetime64[ns]

wallet.date.min()

Timestamp('2020-08-26 09:53:28.377359')

wallet.date.max()

Timestamp('2021-08-24 17:53:28.377359')

today

datetime.datetime(2021, 1, 12, 11, 22, 43, 36486)

wallet[wallet.date < today]

wallet.to_csv("wallet_dataframe.csv")

!head wallet_dataframe.csv

,Unnamed: 0,date,category,description,debit
0,0,2021-03-07 14:53:28.377359,Music,Amazon,421.2073272347991
1,1,2020-10-08 09:53:28.377359,Food,Swiggy,328.4400802428426
2,2,2021-02-23 09:53:28.377359,Books,Amazon,244.67943701511356
3,3,2020-11-01 14:53:28.377359,Utility,Phone,222.75631758052768
4,4,2021-06-05 13:53:28.377359,Books,Flipcart,494.1284923793595
5,5,2021-07-28 19:53:28.377359,Utility,Electricity,219.9417113096841
6,6,2021-04-16 11:53:28.377359,Books,Amazon Kindle,270.32259514795845
7,7,2021-02-15 10:53:28.377359,Food,Zomato,457.1831036346536
8,8,2021-08-10 19:53:28.377359,Utility,Phone,151.4963725994779

df = wallet[['date','category','description','debit']]

df.to_csv("wallet_dataframe1.csv")

!head wallet_dataframe1.csv

,date,category,description,debit
0,2021-03-07 14:53:28.377359,Music,Amazon,421.2073272347991
1,2020-10-08 09:53:28.377359,Food,Swiggy,328.4400802428426
2,2021-02-23 09:53:28.377359,Books,Amazon,244.67943701511356
3,2020-11-01 14:53:28.377359,Utility,Phone,222.75631758052768
4,2021-06-05 13:53:28.377359,Books,Flipcart,494.1284923793595
5,2021-07-28 19:53:28.377359,Utility,Electricity,219.9417113096841
6,2021-04-16 11:53:28.377359,Books,Amazon Kindle,270.32259514795845
7,2021-02-15 10:53:28.377359,Food,Zomato,457.1831036346536
8,2021-08-10 19:53:28.377359,Utility,Phone,151.4963725994779

df.to_csv("wallet_dataframe1.csv", index=False)

!head wallet_dataframe1.csv

date,category,description,debit
2021-03-07 14:53:28.377359,Music,Amazon,421.2073272347991
2020-10-08 09:53:28.377359,Food,Swiggy,328.4400802428426
2021-02-23 09:53:28.377359,Books,Amazon,244.67943701511356
2020-11-01 14:53:28.377359,Utility,Phone,222.75631758052768
2021-06-05 13:53:28.377359,Books,Flipcart,494.1284923793595
2021-07-28 19:53:28.377359,Utility,Electricity,219.9417113096841
2021-04-16 11:53:28.377359,Books,Amazon Kindle,270.32259514795845
2021-02-15 10:53:28.377359,Food,Zomato,457.1831036346536
2021-08-10 19:53:28.377359,Utility,Phone,151.4963725994779

help(df.to_excel)

pd.read_csv("https://raw.githubusercontent.com/vikipedia/python-trainings/master/online_course/source/module2/wallet.csv")

Take-home assignment

Look at the help for DataFrame.to_excel and write the wallet dataset to an Excel workbook, in a sheet named 'wallet'.

Combining dataframes¶

df1 = pd.DataFrame(
    {"a":[1, 2, 3, 4, 5],
    "b":[43, 54, 23, 65, 34],
    "labels" : ['x','y','z','m','n']}
)

df1

df2 = pd.DataFrame(
    {"c": [1, 2, 3, 4],
    'd': [34,56, 56, 45],
    'labels' : ['x','y','z','m']}
)

df2

pd.merge(df2, df1, on='labels')

stocks = pd.DataFrame(
    {
    "value": [123, 130, 140, 150],
    "high":[125, 135, 145, 155],
    "low":[120, 125, 138, 149],
    "volume":[10, 101, 10, 20]},
    index = ['APPLE','AT&T','AGILENT','HP']
)

stocks

stocks1 = pd.DataFrame(
    {
    "value": [123, 130],
    "high":[125, 135],
    "low":[120, 125],
    "volume":[10, 101]},
    index = ['XEROX','TESLA']
)

stocks1

pd.concat([stocks, stocks1])

dfj1 = pd.DataFrame(
    {"a":[1,2,3,4,5],
    "b":[32, 34, 23, 45 ,12]},
    index = ['x','y','z','m','n']
)

dfj2 = pd.DataFrame(
    {"c":[5, 6, 7, 8],
    "d":[65,656,67,67]},
    index = ['x','y','z','m',]
)

dfj1

dfj2

dfj1.join(dfj2)

str operations¶

wallet = pd.read_csv("https://raw.githubusercontent.com/vikipedia/python-trainings/master/online_course/source/module2/wallet.csv")

wallet.date

0     2021-03-07 14:53:28.377359
1     2020-10-08 09:53:28.377359
2     2021-02-23 09:53:28.377359
3     2020-11-01 14:53:28.377359
4     2021-06-05 13:53:28.377359
                 ...            
95    2021-07-19 13:53:28.377359
96    2021-01-12 19:53:28.377359
97    2021-03-25 11:53:28.377359
98    2021-05-13 15:53:28.377359
99    2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: object

s = "2021-03-07 14:53:28.377359"

date, timestamp = s.split()

date

'2021-03-07'

date.split("-")[0]

'2021'

wallet.date.str.split()

0     [2021-03-07, 14:53:28.377359]
1     [2020-10-08, 09:53:28.377359]
2     [2021-02-23, 09:53:28.377359]
3     [2020-11-01, 14:53:28.377359]
4     [2021-06-05, 13:53:28.377359]
                  ...              
95    [2021-07-19, 13:53:28.377359]
96    [2021-01-12, 19:53:28.377359]
97    [2021-03-25, 11:53:28.377359]
98    [2021-05-13, 15:53:28.377359]
99    [2020-10-11, 16:53:28.377359]
Name: date, Length: 100, dtype: object

date_ = wallet.date.str.split(expand=True)[0]

date_df = date_.str.split("-", expand=True)

date_with_columns = date_df.rename(columns={0:'year',1:'month',2:'day'})

date_with_columns

wallet

wallet['year'] = date_with_columns['year']

wallet.groupby('year').sum()

wallet.groupby(['category','year']).sum()

columns_of_insterest = ['date','category','description','debit', 'year']

w = wallet[columns_of_insterest]

w.groupby(['category','year']).sum()

wallet.groupby(['category','year']).sum()[['debit']] # this is going to give me dataframe

wallet.groupby(['category','year']).sum()['debit'] # this will give series

category  year
Books     2020     695.718715
          2021    4234.031678
Food      2020    2368.925397
          2021    5912.263775
Music     2020    1988.923644
          2021    2244.528224
Travel    2020    1561.630016
          2021    4491.301860
Utility   2020    1625.702397
          2021    5936.564836
Name: debit, dtype: float64

d = wallet.groupby(['category','year']).sum()[['debit']] # this is going to give me dataframe

type(d)

pandas.core.frame.DataFrame

s = wallet.groupby(['category','year']).sum()['debit'] # this will give series

type(s)

pandas.core.series.Series

wallet['debit']

0     421.207327
1     328.440080
2     244.679437
3     222.756318
4     494.128492
         ...    
95    388.671213
96    467.554562
97    320.789434
98    442.096469
99    100.455501
Name: debit, Length: 100, dtype: float64

wallet[['debit']]

Working with virtual environments

To create a virtual environment on your system (Windows):

open cmd

then run

python -m venv NAME_OF_VIRTUAL_ENV

This will create a folder named NAME_OF_VIRTUAL_ENV. On unix/linux/mac the folder has the following structure:

+ NAME_OF_VIRTUAL_ENV
  |
  +-bin
  +-include
  +-lib
  +-lib64
  +-pyvenv.cfg

On Windows the following structure will be created instead:

+ NAME_OF_VIRTUAL_ENV
  |
  +-Include
  +-Lib
  +-Scripts
  +-pyvenv.cfg

to activate environment on windows

c:\Users\vik> NAME_OF_VIRTUAL_ENV\Scripts\activate.bat
(NAME_OF_VIRTUAL_ENV) c:\Users\vik>

to deactivate environment on windows

(NAME_OF_VIRTUAL_ENV) c:\Users\vik> NAME_OF_VIRTUAL_ENV\Scripts\deactivate.bat
c:\Users\vik>

to activate on linux/mac/unix

source NAME_OF_VIRTUAL_ENV/bin/activate
(NAME_OF_VIRTUAL_ENV) $

installing packages in virtual environment¶

Activate environment

c:\Users\vik> NAME_OF_VIRTUAL_ENV\Scripts\activate.bat
(NAME_OF_VIRTUAL_ENV) c:\Users\vik>

Run the pip command to install packages

(NAME_OF_VIRTUAL_ENV) c:\Users\vik> pip install pandas

To check which packages are installed in this virtual environment, use the pip freeze command

(NAME_OF_VIRTUAL_ENV) c:\Users\vik> pip freeze

requirements.txt

This is a text file with one entry per line; each entry is a package name (and a version, if required).

%%file requirements.txt
jupyter
pandas
requests
xlrd

Writing requirements.txt

Home assignment

Create a virtual environment.
Activate the environment.
Use the requirements.txt above to install packages in that environment (pip install -r requirements.txt).
Launch Jupyter from cmd using the command jupyter lab.

	Unnamed: 0	debit
category
Books	43.142857	352.125028
Food	50.962963	306.710710
Music	60.625000	264.590742
Travel	45.062500	378.308242
Utility	47.370370	280.083972

	a	b	c	d
x	1	32	5.0	65.0
y	2	34	6.0	656.0
z	3	23	7.0	67.0
m	4	45	8.0	67.0
n	5	12	NaN	NaN

	Unnamed: 0	date	category	description	debit
0	0	2021-03-07 14:53:28.377359	Music	Amazon	421.207327
1	1	2020-10-08 09:53:28.377359	Food	Swiggy	328.440080
2	2	2021-02-23 09:53:28.377359	Books	Amazon	244.679437
3	3	2020-11-01 14:53:28.377359	Utility	Phone	222.756318
4	4	2021-06-05 13:53:28.377359	Books	Flipcart	494.128492
...	...	...	...	...	...
95	95	2021-07-19 13:53:28.377359	Utility	Phone	388.671213
96	96	2021-01-12 19:53:28.377359	Books	Flipcart	467.554562
97	97	2021-03-25 11:53:28.377359	Utility	Phone	320.789434
98	98	2021-05-13 15:53:28.377359	Travel	Taxi	442.096469
99	99	2020-10-11 16:53:28.377359	Food	Hotel	100.455501

	Unnamed: 0	date	category	description	debit
23	23	2020-12-11 10:53:28.377359	Music	Netflix	354.940241
65	65	2020-10-23 18:53:28.377359	Music	Netflix	188.748743
67	67	2021-07-31 14:53:28.377359	Music	Netflix	324.786917
76	76	2020-11-17 09:53:28.377359	Music	Netflix	197.534600
79	79	2021-08-17 09:53:28.377359	Music	Netflix	321.763416
84	84	2020-09-21 10:53:28.377359	Music	Netflix	158.793646

	Unnamed: 0	debit
category
Books	604	4929.750393
Food	1376	8281.189173
Music	970	4233.451868
Travel	721	6052.931876
Utility	1279	7562.267233

	Unnamed: 0	date	description	debit
category
Books	96	2021-06-30 18:53:28.377359	Flipcart	498.100496
Food	99	2021-08-24 17:53:28.377359	Zomato	489.143483
Music	90	2021-08-17 09:53:28.377359	spotify	421.207327
Travel	98	2021-08-15 17:53:28.377359	Taxi	494.124399
Utility	97	2021-08-23 17:53:28.377359	Phone	499.858182

	value	high	low	volume
APPLE	123	125	120	10
AT&T	130	135	125	101
AGILENT	140	145	138	10
HP	150	155	149	20

	year	month	day
0	2021	03	07
1	2020	10	08
2	2021	02	23
3	2020	11	01
4	2021	06	05
...	...	...	...
95	2021	07	19
96	2021	01	12
97	2021	03	25
98	2021	05	13
99	2020	10	11

		Unnamed: 0	debit
category	year
Books	2020	104	695.718715
Books	2021	500	4234.031678
Food	2020	437	2368.925397
Food	2021	939	5912.263775
Music	2020	552	1988.923644
Music	2021	418	2244.528224
Travel	2020	133	1561.630016
Travel	2021	588	4491.301860
Utility	2020	187	1625.702397
Utility	2021	1092	5936.564836

	a	b	labels
0	1	43	x
1	2	54	y
2	3	23	z
3	4	65	m
4	5	34	n

	c	d	labels
0	1	34	x
1	2	56	y
2	3	56	z
3	4	45	m

	c	d	labels	a	b
0	1	34	x	1	43
1	2	56	y	2	54
2	3	56	z	3	23
3	4	45	m	4	65

	c	d
x	5	65
y	6	656
z	7	67
m	8	67