import pandas as pd


stocks = pd.Series([542,345.8, 454, 500.5], index=["APPLE","AT&T","IBM","NIKE"])


stocks

APPLE    542.0
AT&T     345.8
IBM      454.0
NIKE     500.5
dtype: float64


stocks['APPLE']

542.0


stocks_ = pd.Series([542,345.8, 454, 500.5])


stocks_ # if no labels are given by default it takes it as number starting from 0

0    542.0
1    345.8
2    454.0
3    500.5
dtype: float64


stocks_dict = dict(zip(["APPLE","AT&T","IBM","NIKE"], [542,345.8, 454, 500.5]))


stocks_dict['APPLE']

542


stocks_dict[0]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-10-f7d1349b86c2> in <module>
----> 1 stocks_dict[0]

KeyError: 0


stocks['APPLE']

542.0


stocks[0]

542.0


stocks.mean()

460.575


stocks.std()

84.53907873482731


stocks.isnull() # if some item in series is empty or None or NaN

APPLE    False
AT&T     False
IBM      False
NIKE     False
dtype: bool


stocks.head()

APPLE    542.0
AT&T     345.8
IBM      454.0
NIKE     500.5
dtype: float64


stocks.head(2)

APPLE    542.0
AT&T     345.8
dtype: float64


stocks.sum()

1842.3


score_math1 = pd.Series([87, 88, 99], index=['student1','student2','student2'])


score_math2 = pd.Series([98, 87, 90], index=['student1','student2','student2'])


score_math1

student1    87
student2    88
student2    99
dtype: int64


score_math2

student1    98
student2    87
student2    90
dtype: int64


score_math1 + score_math2

student1    185
student2    175
student2    189
dtype: int64


score_math1/100*10

student1    8.7
student2    8.8
student2    9.9
dtype: float64


df = pd.DataFrame({
    "value": [234.5, 221.6, 125.7, 100.5],
    "high value": [240.3,22.5,127.3,105.0],
    "low value":[233.0,220,123,104.0],
    "volume": [100, 200, 50, 1000]},
    
    index = ["APPLE","AT&T","IBM","NIKE"]
)

df


df['APPLE'] # label will not work for dataframe

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2894             try:
-> 2895                 return self._engine.get_loc(casted_key)
   2896             except KeyError as err:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'APPLE'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-33-5691c2fd1745> in <module>
----> 1 df['APPLE'] # label will not work for dataframe

~/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2900             if self.columns.nlevels > 1:
   2901                 return self._getitem_multilevel(key)
-> 2902             indexer = self.columns.get_loc(key)
   2903             if is_integer(indexer):
   2904                 indexer = [indexer]

~/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2895                 return self._engine.get_loc(casted_key)
   2896             except KeyError as err:
-> 2897                 raise KeyError(key) from err
   2898 
   2899         if tolerance is not None:

KeyError: 'APPLE'


value = df['value']


value

APPLE    234.5
AT&T     221.6
IBM      125.7
NIKE     100.5
Name: value, dtype: float64


value['APPLE']

234.5

df


df.value # only if column names do not have space in it

APPLE    234.5
AT&T     221.6
IBM      125.7
NIKE     100.5
Name: value, dtype: float64


df.high value

  File "<ipython-input-40-bc75c75a26f6>", line 1
    df.high value
            ^
SyntaxError: invalid syntax


df['high value']

APPLE    240.3
AT&T      22.5
IBM      127.3
NIKE     105.0
Name: high value, dtype: float64


df.mean()

value         170.575
high value    123.775
low value     170.000
volume        337.500
dtype: float64


df.describe()


df.head(2)


df.sum() # it is returning series... with column names as label

value          682.3
high value     495.1
low value      680.0
volume        1350.0
dtype: float64


df.sum()['value']

682.3000000000001


df.value.sum()

682.3000000000001


stocks

APPLE    542.0
AT&T     345.8
IBM      454.0
NIKE     500.5
dtype: float64

df


stocks['APPLE'] # by label

542.0


stocks[0] # by index

542.0


df.value # by column name as attribute

APPLE    234.5
AT&T     221.6
IBM      125.7
NIKE     100.5
Name: value, dtype: float64


df['value'] # by column name as dictionary like interface

APPLE    234.5
AT&T     221.6
IBM      125.7
NIKE     100.5
Name: value, dtype: float64


df.loc['APPLE'] # give location (row) with label 'APPLE'

value         234.5
high value    240.3
low value     233.0
volume        100.0
Name: APPLE, dtype: float64


df.iloc[0] # access row of dataframe by index

value         234.5
high value    240.3
low value     233.0
volume        100.0
Name: APPLE, dtype: float64


stocks

APPLE    542.0
AT&T     345.8
IBM      454.0
NIKE     500.5
dtype: float64


stocks[stocks>400] # this is filtering based on some criteria

APPLE    542.0
IBM      454.0
NIKE     500.5
dtype: float64


stocks>400

APPLE     True
AT&T     False
IBM       True
NIKE      True
dtype: bool


stocks[stocks>400]

APPLE    542.0
IBM      454.0
NIKE     500.5
dtype: float64


stocks[0:3] # simple slicing

APPLE    542.0
AT&T     345.8
IBM      454.0
dtype: float64


[s for s in stocks if s > 400] # this very slow and would not prefer to do this with series

df


stocks[stocks<400]

AT&T    345.8
dtype: float64

df


df[df.value<200]


df.value<200

APPLE    False
AT&T     False
IBM       True
NIKE      True
Name: value, dtype: bool


x = df.value<200

x

APPLE    False
AT&T     False
IBM       True
NIKE      True
Name: value, dtype: bool


df[x]


pd.read_csv("wallet.csv")


wallet = pd.read_csv("wallet.csv", index_col=0)


wallet


wallet.describe()


wallet.head()


wallet.debit.sum()

31059.590543177277


music_wallet = wallet[wallet.category=='Music']


music_wallet


music_wallet.debit.sum() # expenses done for music

4233.451868232711


netflix = wallet[wallet.description=="Netflix"]


netflix


netflix.debit.sum()

1546.5675619264125


!python3 -m pip install pandas  openpyxl lxml

Requirement already satisfied: pandas in /home/vikrant/anaconda3/lib/python3.8/site-packages (1.1.3)
Requirement already satisfied: openpyxl in /home/vikrant/anaconda3/lib/python3.8/site-packages (3.0.5)
Requirement already satisfied: lxml in /home/vikrant/anaconda3/lib/python3.8/site-packages (4.6.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /home/vikrant/anaconda3/lib/python3.8/site-packages (from pandas) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in /home/vikrant/anaconda3/lib/python3.8/site-packages (from pandas) (2020.1)
Requirement already satisfied: numpy>=1.15.4 in /home/vikrant/anaconda3/lib/python3.8/site-packages (from pandas) (1.19.2)
Requirement already satisfied: jdcal in /home/vikrant/anaconda3/lib/python3.8/site-packages (from openpyxl) (1.4.1)
Requirement already satisfied: et-xmlfile in /home/vikrant/anaconda3/lib/python3.8/site-packages (from openpyxl) (1.0.1)
Requirement already satisfied: six>=1.5 in /home/vikrant/anaconda3/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)


url = "https://raw.githubusercontent.com/vikipedia/python-trainings/master/online_course/source/module2/wallet.csv"
w = pd.read_csv(url,
               index_col=0)

w


%%file stocks.csv
symbol,value,high value,low value,volume
APPLE,234.5,240.3,233.0,100
AT&T,221.6,22.5,220.0,200
IBM,125.7,127.3,123.0,50
NIKE,100.5,105.0,104.0,1000

Writing stocks.csv


pd.read_csv("stocks.csv")


stocks =pd.read_csv("stocks.csv", index_col=0)


stocks


stocks.value['APPLE']

234.5


%%file stocks1.csv
symbol,value,high value,low value,volume
APPLE,234.5,240.3,233.0,
AT&T,221.6,22.5,220.0,200
IBM,125.7,127.3,,50
NIKE,100.5,105.0,104.0,1000

Writing stocks1.csv


stocks1 = pd.read_csv("stocks1.csv", index_col=0)


stocks1


stocks1.isnull().sum()

value         0
high value    0
low value     1
volume        1
dtype: int64


stocks1.sum()

value          682.3
high value     495.1
low value      557.0
volume        1250.0
dtype: float64


pd.DataFrame({"y1":[90,30],
              "y2":[80,68]},
             index = ["maths",'english'])


y1 = pd.Series([90,30], index=['maths','english'])
y2 = pd.Series([80,68], index=['maths','english'])
pd.DataFrame({"y1":y1, "y2":y2})


y1 = pd.Series([90,30], index=['maths','english'])
y2 = pd.Series([80,68], index=['hindi','english'])
pd.DataFrame({"y1":y1, "y2":y2})


wallet.date

0     2021-03-07 14:53:28.377359
1     2020-10-08 09:53:28.377359
2     2021-02-23 09:53:28.377359
3     2020-11-01 14:53:28.377359
4     2021-06-05 13:53:28.377359
                 ...            
95    2021-07-19 13:53:28.377359
96    2021-01-12 19:53:28.377359
97    2021-03-25 11:53:28.377359
98    2021-05-13 15:53:28.377359
99    2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: object


wallet.date.str.split(" ")

0     [2021-03-07, 14:53:28.377359]
1     [2020-10-08, 09:53:28.377359]
2     [2021-02-23, 09:53:28.377359]
3     [2020-11-01, 14:53:28.377359]
4     [2021-06-05, 13:53:28.377359]
                  ...              
95    [2021-07-19, 13:53:28.377359]
96    [2021-01-12, 19:53:28.377359]
97    [2021-03-25, 11:53:28.377359]
98    [2021-05-13, 15:53:28.377359]
99    [2020-10-11, 16:53:28.377359]
Name: date, Length: 100, dtype: object


timecols_df = wallet.date.str.split(" ", expand=True)


timecols_df


timecols_df.rename(columns={0:"date", 1:"timestamp"})


timecols_df = timecols_df.rename(columns={0:"date", 1:"timestamp"})


timecols_df


wallet


del wallet['date'] # this has modified original dataframe


wallet


wallet['date'] = timecols_df['date']


wallet['timestamp'] = timecols_df['timestamp']


wallet


wallet[wallet.date=="2021-03-07"]


category_group  = wallet.groupby("category")


category_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd2f20f2040>


category_group.sum()# give me mean debit (or all numeric columns) by category


category_group.mean() # give me mean debit (or all numeric columns) by category


multiple_groups = wallet.groupby(['category', 'description'])


multiple_groups.sum()


wallet[wallet.category=='Music'].sum()

category       MusicMusicMusicMusicMusicMusicMusicMusicMusicM...
description    AmazonAmazonNetflixAmazonspotifyspotifyNetflix...
debit                                                    4233.45
date           2021-03-072021-07-242020-12-112021-04-182021-0...
timestamp      14:53:28.37735914:53:28.37735910:53:28.3773591...
dtype: object


category_group.sum()


wallet


wallet.to_csv("wallet_from_frame.csv")


!python3 head.py wallet_from_frame.csv

,category,description,debit,date,timestamp
0,Music,Amazon,421.2073272347991,2021-03-07,14:53:28.377359
1,Food,Swiggy,328.4400802428426,2020-10-08,09:53:28.377359
2,Books,Amazon,244.67943701511356,2021-02-23,09:53:28.377359
3,Utility,Phone,222.75631758052768,2020-11-01,14:53:28.377359
4,Books,Flipcart,494.1284923793595,2021-06-05,13:53:28.377359
5,Utility,Electricity,219.9417113096841,2021-07-28,19:53:28.377359
6,Books,Amazon Kindle,270.32259514795845,2021-04-16,11:53:28.377359
7,Food,Zomato,457.1831036346536,2021-02-15,10:53:28.377359
8,Utility,Phone,151.4963725994779,2021-08-10,19:53:28.377359


wallet.to_csv("wallet_from_frame.csv", index=False)


!python3 head.py wallet_from_frame.csv

category,description,debit,date,timestamp
Music,Amazon,421.2073272347991,2021-03-07,14:53:28.377359
Food,Swiggy,328.4400802428426,2020-10-08,09:53:28.377359
Books,Amazon,244.67943701511356,2021-02-23,09:53:28.377359
Utility,Phone,222.75631758052768,2020-11-01,14:53:28.377359
Books,Flipcart,494.1284923793595,2021-06-05,13:53:28.377359
Utility,Electricity,219.9417113096841,2021-07-28,19:53:28.377359
Books,Amazon Kindle,270.32259514795845,2021-04-16,11:53:28.377359
Food,Zomato,457.1831036346536,2021-02-15,10:53:28.377359
Utility,Phone,151.4963725994779,2021-08-10,19:53:28.377359


music_wallet


category_group.sum()


category_group.sum().to_csv("wallet_sum_by_category.csv")


!python3 cat.py wallet_sum_by_category.csv

category,debit
Books,4929.750393283797
Food,8281.189172581233
Music,4233.451868232711
Travel,6052.931876440963
Utility,7562.267232638568


writer = pd.ExcelWriter("wallet1.xlsx", engine="xlsxwriter")
wallet.to_excel(writer, sheet_name="statement")
writer.save()


!ls wallet*

wallet1.xlsx  wallet.csv  wallet_from_frame.csv  wallet_sum_by_category.csv

	value	high value	low value	volume
count	4.000000	4.000000	4.000000	4.000000
mean	170.575000	123.775000	170.000000	337.500000
std	67.365242	89.813933	65.914085	446.047456
min	100.500000	22.500000	104.000000	50.000000
25%	119.400000	84.375000	118.250000	87.500000
50%	173.650000	116.150000	171.500000	150.000000
75%	224.825000	155.550000	223.250000	400.000000
max	234.500000	240.300000	233.000000	1000.000000

	debit
count	100.000000
mean	310.595905
std	121.178218
min	100.455501
25%	216.429447
50%	318.110776
75%	424.109079
max	499.858182

Python Virtual Training For Arcesium - Module III - Day 2¶

Series and DataFrame¶

accessing items from series/DataFrame¶

Selection¶

More detailed documents for pandas¶

Virtual ennvironment¶

	value	high value	low value	volume
APPLE	234.5	240.3	233.0	100
AT&T	221.6	22.5	220.0	200
IBM	125.7	127.3	123.0	50
NIKE	100.5	105.0	104.0	1000

	Unnamed: 0	date	category	description	debit
0	0	2021-03-07 14:53:28.377359	Music	Amazon	421.207327
1	1	2020-10-08 09:53:28.377359	Food	Swiggy	328.440080
2	2	2021-02-23 09:53:28.377359	Books	Amazon	244.679437
3	3	2020-11-01 14:53:28.377359	Utility	Phone	222.756318
4	4	2021-06-05 13:53:28.377359	Books	Flipcart	494.128492
...	...	...	...	...	...
95	95	2021-07-19 13:53:28.377359	Utility	Phone	388.671213
96	96	2021-01-12 19:53:28.377359	Books	Flipcart	467.554562
97	97	2021-03-25 11:53:28.377359	Utility	Phone	320.789434
98	98	2021-05-13 15:53:28.377359	Travel	Taxi	442.096469
99	99	2020-10-11 16:53:28.377359	Food	Hotel	100.455501

	date	category	description	debit
23	2020-12-11 10:53:28.377359	Music	Netflix	354.940241
65	2020-10-23 18:53:28.377359	Music	Netflix	188.748743
67	2021-07-31 14:53:28.377359	Music	Netflix	324.786917
76	2020-11-17 09:53:28.377359	Music	Netflix	197.534600
79	2021-08-17 09:53:28.377359	Music	Netflix	321.763416
84	2020-09-21 10:53:28.377359	Music	Netflix	158.793646

	value	high value	low value	volume
symbol
APPLE	234.5	240.3	233.0	100
AT&T	221.6	22.5	220.0	200
IBM	125.7	127.3	123.0	50
NIKE	100.5	105.0	104.0	1000

	0	1
0	2021-03-07	14:53:28.377359
1	2020-10-08	09:53:28.377359
2	2021-02-23	09:53:28.377359
3	2020-11-01	14:53:28.377359
4	2021-06-05	13:53:28.377359
...	...	...
95	2021-07-19	13:53:28.377359
96	2021-01-12	19:53:28.377359
97	2021-03-25	11:53:28.377359
98	2021-05-13	15:53:28.377359
99	2020-10-11	16:53:28.377359

	debit
category
Books	4929.750393
Food	8281.189173
Music	4233.451868
Travel	6052.931876
Utility	7562.267233

	debit
category
Books	352.125028
Food	306.710710
Music	264.590742
Travel	378.308242
Utility	280.083972

		debit
category	description
Books	Amazon	1037.442802
	Amazon Kindle	1389.052376
	Flipcart	2503.255216
Food	Hotel	2752.174732
	Swiggy	1936.495366
	Zomato	3592.519075
Music	Amazon	1467.247766
	Netflix	1546.567562
	spotify	1219.636541
Travel	Auto	2210.428935
	Metro	1216.463665
	Taxi	2626.039276
Utility	Electricity	2885.064355
Utility	Phone	4677.202878

	y1	y2
maths	90	80
english	30	68

	y1	y2
maths	90	80
english	30	68

	y1	y2
english	30.0	68.0
hindi	NaN	80.0
maths	90.0	NaN