Mar 13-17, 2023 Vikrant Patil
All notes are available online at https://notes.pipal.in/2023/arcesium_finop_jan/
Please login to https://engage.pipal.in/ and launch jupyter lab
For today create a notebook with name module3-day1
notebook names are case sensitive. Make sure you give correct name
© Pipal Academy LLP
import pandas as pd
file_url = "https://raw.githubusercontent.com/vikipedia/python-trainings/master/online_course/source/module2/wallet.csv"
pd.read_csv(file_url)
--------------------------------------------------------------------------- ConnectionResetError Traceback (most recent call last) File /usr/lib/python3.10/urllib/request.py:1348, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args) 1347 try: -> 1348 h.request(req.get_method(), req.selector, req.data, headers, 1349 encode_chunked=req.has_header('Transfer-encoding')) 1350 except OSError as err: # timeout error File /usr/lib/python3.10/http/client.py:1282, in HTTPConnection.request(self, method, url, body, headers, encode_chunked) 1281 """Send a complete request to the server.""" -> 1282 self._send_request(method, url, body, headers, encode_chunked) File /usr/lib/python3.10/http/client.py:1328, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked) 1327 body = _encode(body, 'body') -> 1328 self.endheaders(body, encode_chunked=encode_chunked) File /usr/lib/python3.10/http/client.py:1277, in HTTPConnection.endheaders(self, message_body, encode_chunked) 1276 raise CannotSendHeader() -> 1277 self._send_output(message_body, encode_chunked=encode_chunked) File /usr/lib/python3.10/http/client.py:1037, in HTTPConnection._send_output(self, message_body, encode_chunked) 1036 del self._buffer[:] -> 1037 self.send(msg) 1039 if message_body is not None: 1040 1041 # create a consistent interface to message_body File /usr/lib/python3.10/http/client.py:975, in HTTPConnection.send(self, data) 974 if self.auto_open: --> 975 self.connect() 976 else: File /usr/lib/python3.10/http/client.py:1454, in HTTPSConnection.connect(self) 1452 server_hostname = self.host -> 1454 self.sock = self._context.wrap_socket(self.sock, 1455 server_hostname=server_hostname) File /usr/lib/python3.10/ssl.py:513, in SSLContext.wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session) 507 def wrap_socket(self, sock, server_side=False, 508 do_handshake_on_connect=True, 509 suppress_ragged_eofs=True, 510 server_hostname=None, session=None): 511 # SSLSocket class handles server_hostname encoding before it calls 512 # ctx._wrap_socket() --> 513 return self.sslsocket_class._create( 514 sock=sock, 515 server_side=server_side, 516 do_handshake_on_connect=do_handshake_on_connect, 517 suppress_ragged_eofs=suppress_ragged_eofs, 518 server_hostname=server_hostname, 519 context=self, 520 session=session 521 ) File /usr/lib/python3.10/ssl.py:1071, in SSLSocket._create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session) 1070 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets") -> 1071 self.do_handshake() 1072 except (OSError, ValueError): File /usr/lib/python3.10/ssl.py:1342, in SSLSocket.do_handshake(self, block) 1341 self.settimeout(None) -> 1342 self._sslobj.do_handshake() 1343 finally: ConnectionResetError: [Errno 104] Connection reset by peer During handling of the above exception, another exception occurred: URLError Traceback (most recent call last) Cell In[285], line 1 ----> 1 pd.read_csv(file_url) File ~/usr/local/default/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File ~/usr/local/default/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(*args, **kwargs) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options) 935 kwds_defaults = _refine_defaults_read( 936 dialect, 937 delimiter, (...) 946 defaults={"delimiter": ","}, 947 ) 948 kwds.update(kwds_defaults) --> 950 return _read(filepath_or_buffer, kwds) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds) 602 _validate_names(kwds.get("names", None)) 604 # Create the parser. --> 605 parser = TextFileReader(filepath_or_buffer, **kwds) 607 if chunksize or iterator: 608 return parser File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds) 1439 self.options["has_index_names"] = kwds["has_index_names"] 1441 self.handles: IOHandles | None = None -> 1442 self._engine = self._make_engine(f, self.engine) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine) 1733 if "b" not in mode: 1734 mode += "b" -> 1735 self.handles = get_handle( 1736 f, 1737 mode, 1738 encoding=self.options.get("encoding", None), 1739 compression=self.options.get("compression", None), 1740 memory_map=self.options.get("memory_map", False), 1741 is_text=is_text, 1742 errors=self.options.get("encoding_errors", "strict"), 1743 storage_options=self.options.get("storage_options", None), 1744 ) 1745 assert self.handles is not None 1746 f = self.handles.handle File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/common.py:713, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 710 codecs.lookup_error(errors) 712 # open URLs --> 713 ioargs = _get_filepath_or_buffer( 714 path_or_buf, 715 encoding=encoding, 716 compression=compression, 717 mode=mode, 718 storage_options=storage_options, 719 ) 721 handle = ioargs.filepath_or_buffer 722 handles: list[BaseBuffer] File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/common.py:363, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options) 361 # assuming storage_options is to be interpreted as headers 362 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options) --> 363 with urlopen(req_info) as req: 364 content_encoding = req.headers.get("Content-Encoding", None) 365 if content_encoding == "gzip": 366 # Override compression based on Content-Encoding header File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/common.py:265, in urlopen(*args, **kwargs) 259 """ 260 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of 261 the stdlib. 262 """ 263 import urllib.request --> 265 return urllib.request.urlopen(*args, **kwargs) File /usr/lib/python3.10/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context) 214 else: 215 opener = _opener --> 216 return opener.open(url, data, timeout) File /usr/lib/python3.10/urllib/request.py:519, in OpenerDirector.open(self, fullurl, data, timeout) 516 req = meth(req) 518 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method()) --> 519 response = self._open(req, data) 521 # post-process response 522 meth_name = protocol+"_response" File /usr/lib/python3.10/urllib/request.py:536, in OpenerDirector._open(self, req, data) 533 return result 535 protocol = req.type --> 536 result = self._call_chain(self.handle_open, protocol, protocol + 537 '_open', req) 538 if result: 539 return result File /usr/lib/python3.10/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args) 494 for handler in handlers: 495 func = getattr(handler, meth_name) --> 496 result = func(*args) 497 if result is not None: 498 return result File /usr/lib/python3.10/urllib/request.py:1391, in HTTPSHandler.https_open(self, req) 1390 def https_open(self, req): -> 1391 return self.do_open(http.client.HTTPSConnection, req, 1392 context=self._context, check_hostname=self._check_hostname) File /usr/lib/python3.10/urllib/request.py:1351, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args) 1348 h.request(req.get_method(), req.selector, req.data, headers, 1349 encode_chunked=req.has_header('Transfer-encoding')) 1350 except OSError as err: # timeout error -> 1351 raise URLError(err) 1352 r = h.getresponse() 1353 except: URLError: <urlopen error [Errno 104] Connection reset by peer>
excel_url = "https://raw.githubusercontent.com/vikipedia/python-trainings/master/online_course/source/module2/wallet.xlsx"
pd.read_excel(excel_url)
--------------------------------------------------------------------------- ConnectionResetError Traceback (most recent call last) File /usr/lib/python3.10/urllib/request.py:1348, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args) 1347 try: -> 1348 h.request(req.get_method(), req.selector, req.data, headers, 1349 encode_chunked=req.has_header('Transfer-encoding')) 1350 except OSError as err: # timeout error File /usr/lib/python3.10/http/client.py:1282, in HTTPConnection.request(self, method, url, body, headers, encode_chunked) 1281 """Send a complete request to the server.""" -> 1282 self._send_request(method, url, body, headers, encode_chunked) File /usr/lib/python3.10/http/client.py:1328, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked) 1327 body = _encode(body, 'body') -> 1328 self.endheaders(body, encode_chunked=encode_chunked) File /usr/lib/python3.10/http/client.py:1277, in HTTPConnection.endheaders(self, message_body, encode_chunked) 1276 raise CannotSendHeader() -> 1277 self._send_output(message_body, encode_chunked=encode_chunked) File /usr/lib/python3.10/http/client.py:1037, in HTTPConnection._send_output(self, message_body, encode_chunked) 1036 del self._buffer[:] -> 1037 self.send(msg) 1039 if message_body is not None: 1040 1041 # create a consistent interface to message_body File /usr/lib/python3.10/http/client.py:975, in HTTPConnection.send(self, data) 974 if self.auto_open: --> 975 self.connect() 976 else: File /usr/lib/python3.10/http/client.py:1454, in HTTPSConnection.connect(self) 1452 server_hostname = self.host -> 1454 self.sock = self._context.wrap_socket(self.sock, 1455 server_hostname=server_hostname) File /usr/lib/python3.10/ssl.py:513, in SSLContext.wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session) 507 def wrap_socket(self, sock, server_side=False, 508 do_handshake_on_connect=True, 509 suppress_ragged_eofs=True, 510 server_hostname=None, session=None): 511 # SSLSocket class handles server_hostname encoding before it calls 512 # ctx._wrap_socket() --> 513 return self.sslsocket_class._create( 514 sock=sock, 515 server_side=server_side, 516 do_handshake_on_connect=do_handshake_on_connect, 517 suppress_ragged_eofs=suppress_ragged_eofs, 518 server_hostname=server_hostname, 519 context=self, 520 session=session 521 ) File /usr/lib/python3.10/ssl.py:1071, in SSLSocket._create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session) 1070 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets") -> 1071 self.do_handshake() 1072 except (OSError, ValueError): File /usr/lib/python3.10/ssl.py:1342, in SSLSocket.do_handshake(self, block) 1341 self.settimeout(None) -> 1342 self._sslobj.do_handshake() 1343 finally: ConnectionResetError: [Errno 104] Connection reset by peer During handling of the above exception, another exception occurred: URLError Traceback (most recent call last) Cell In[8], line 1 ----> 1 pd.read_excel(excel_url) File ~/usr/local/default/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File ~/usr/local/default/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(*args, **kwargs) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/excel/_base.py:482, in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, decimal, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options) 480 if not isinstance(io, ExcelFile): 481 should_close = True --> 482 io = ExcelFile(io, storage_options=storage_options, engine=engine) 483 elif engine and engine != io.engine: 484 raise ValueError( 485 "Engine should not be specified when passing " 486 "an ExcelFile - ExcelFile already has the engine set" 487 ) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/excel/_base.py:1652, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options) 1650 ext = "xls" 1651 else: -> 1652 ext = inspect_excel_format( 1653 content_or_path=path_or_buffer, storage_options=storage_options 1654 ) 1655 if ext is None: 1656 raise ValueError( 1657 "Excel file format cannot be determined, you must specify " 1658 "an engine manually." 1659 ) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/excel/_base.py:1525, in inspect_excel_format(content_or_path, storage_options) 1522 if isinstance(content_or_path, bytes): 1523 content_or_path = BytesIO(content_or_path) -> 1525 with get_handle( 1526 content_or_path, "rb", storage_options=storage_options, is_text=False 1527 ) as handle: 1528 stream = handle.handle 1529 stream.seek(0) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/common.py:713, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 710 codecs.lookup_error(errors) 712 # open URLs --> 713 ioargs = _get_filepath_or_buffer( 714 path_or_buf, 715 encoding=encoding, 716 compression=compression, 717 mode=mode, 718 storage_options=storage_options, 719 ) 721 handle = ioargs.filepath_or_buffer 722 handles: list[BaseBuffer] File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/common.py:363, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options) 361 # assuming storage_options is to be interpreted as headers 362 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options) --> 363 with urlopen(req_info) as req: 364 content_encoding = req.headers.get("Content-Encoding", None) 365 if content_encoding == "gzip": 366 # Override compression based on Content-Encoding header File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/common.py:265, in urlopen(*args, **kwargs) 259 """ 260 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of 261 the stdlib. 262 """ 263 import urllib.request --> 265 return urllib.request.urlopen(*args, **kwargs) File /usr/lib/python3.10/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context) 214 else: 215 opener = _opener --> 216 return opener.open(url, data, timeout) File /usr/lib/python3.10/urllib/request.py:519, in OpenerDirector.open(self, fullurl, data, timeout) 516 req = meth(req) 518 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method()) --> 519 response = self._open(req, data) 521 # post-process response 522 meth_name = protocol+"_response" File /usr/lib/python3.10/urllib/request.py:536, in OpenerDirector._open(self, req, data) 533 return result 535 protocol = req.type --> 536 result = self._call_chain(self.handle_open, protocol, protocol + 537 '_open', req) 538 if result: 539 return result File /usr/lib/python3.10/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args) 494 for handler in handlers: 495 func = getattr(handler, meth_name) --> 496 result = func(*args) 497 if result is not None: 498 return result File /usr/lib/python3.10/urllib/request.py:1391, in HTTPSHandler.https_open(self, req) 1390 def https_open(self, req): -> 1391 return self.do_open(http.client.HTTPSConnection, req, 1392 context=self._context, check_hostname=self._check_hostname) File /usr/lib/python3.10/urllib/request.py:1351, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args) 1348 h.request(req.get_method(), req.selector, req.data, headers, 1349 encode_chunked=req.has_header('Transfer-encoding')) 1350 except OSError as err: # timeout error -> 1351 raise URLError(err) 1352 r = h.getresponse() 1353 except: URLError: <urlopen error [Errno 104] Connection reset by peer>
pd.read_csv("wallet.csv")
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... | ... |
| 95 | 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 5 columns
wallet = pd.read_csv("wallet.csv")
wallet
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... | ... |
| 95 | 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 5 columns
type(wallet)
pandas.core.frame.DataFrame
pd.read_excel("wallet.xlsx")
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... | ... |
| 95 | 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 5 columns
help(pd.read_csv)
Help on function read_csv in module pandas.io.parsers.readers:
read_csv(filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', *, sep: 'str | None | lib.NoDefault' = <no_default>, delimiter: 'str | None | lib.NoDefault' = None, header: "int | Sequence[int] | None | Literal['infer']" = 'infer', names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>, index_col: 'IndexLabel | Literal[False] | None' = None, usecols=None, squeeze: 'bool | None' = None, prefix: 'str | lib.NoDefault' = <no_default>, mangle_dupe_cols: 'bool' = True, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters=None, true_values=None, false_values=None, skipinitialspace: 'bool' = False, skiprows=None, skipfooter: 'int' = 0, nrows: 'int | None' = None, na_values=None, keep_default_na: 'bool' = True, na_filter: 'bool' = True, verbose: 'bool' = False, skip_blank_lines: 'bool' = True, parse_dates=None, infer_datetime_format: 'bool' = False, keep_date_col: 'bool' = False, date_parser=None, dayfirst: 'bool' = False, cache_dates: 'bool' = True, iterator: 'bool' = False, chunksize: 'int | None' = None, compression: 'CompressionOptions' = 'infer', thousands: 'str | None' = None, decimal: 'str' = '.', lineterminator: 'str | None' = None, quotechar: 'str' = '"', quoting: 'int' = 0, doublequote: 'bool' = True, escapechar: 'str | None' = None, comment: 'str | None' = None, encoding: 'str | None' = None, encoding_errors: 'str | None' = 'strict', dialect: 'str | csv.Dialect | None' = None, error_bad_lines: 'bool | None' = None, warn_bad_lines: 'bool | None' = None, on_bad_lines=None, delim_whitespace: 'bool' = False, low_memory=True, memory_map: 'bool' = False, float_precision: "Literal['high', 'legacy'] | None" = None, storage_options: 'StorageOptions' = None) -> 'DataFrame | TextFileReader'
Read a comma-separated values (csv) file into DataFrame.
Also supports optionally iterating or breaking of the file
into chunks.
Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
Parameters
----------
filepath_or_buffer : str, path object or file-like object
Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
expected. A local file could be: file://localhost/path/to/table.csv.
If you want to pass in a path object, pandas accepts any ``os.PathLike``.
By file-like object, we refer to objects with a ``read()`` method, such as
a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default ','
Delimiter to use. If sep is None, the C engine cannot automatically detect
the separator, but the Python parsing engine can, meaning the latter will
be used and automatically detect the separator by Python's builtin sniffer
tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
different from ``'\s+'`` will be interpreted as regular expressions and
will also force the use of the Python parsing engine. Note that regex
delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, default ``None``
Alias for sep.
header : int, list of int, None, default 'infer'
Row number(s) to use as the column names, and the start of the
data. Default behavior is to infer the column names: if no names
are passed the behavior is identical to ``header=0`` and column
names are inferred from the first line of the file, if column
names are passed explicitly then the behavior is identical to
``header=None``. Explicitly pass ``header=0`` to be able to
replace existing names. The header can be a list of integers that
specify row locations for a multi-index on the columns
e.g. [0,1,3]. Intervening rows that are not specified will be
skipped (e.g. 2 in this example is skipped). Note that this
parameter ignores commented lines and empty lines if
``skip_blank_lines=True``, so ``header=0`` denotes the first line of
data rather than the first line of the file.
names : array-like, optional
List of column names to use. If the file contains a header row,
then you should explicitly pass ``header=0`` to override the column names.
Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, optional, default ``None``
Column(s) to use as the row labels of the ``DataFrame``, either given as
string name or column index. If a sequence of int / str is given, a
MultiIndex is used.
Note: ``index_col=False`` can be used to force pandas to *not* use the first
column as the index, e.g. when you have a malformed file with delimiters at
the end of each line.
usecols : list-like or callable, optional
Return a subset of the columns. If list-like, all elements must either
be positional (i.e. integer indices into the document columns) or strings
that correspond to column names provided either by the user in `names` or
inferred from the document header row(s). If ``names`` are given, the document
header row(s) are not taken into account. For example, a valid list-like
`usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
To instantiate a DataFrame from ``data`` with element order preserved use
``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
in ``['foo', 'bar']`` order or
``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
for ``['bar', 'foo']`` order.
If callable, the callable function will be evaluated against the column
names, returning names where the callable function evaluates to True. An
example of a valid callable argument would be ``lambda x: x.upper() in
['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
parsing time and lower memory usage.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.
.. deprecated:: 1.4.0
Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
the data.
prefix : str, optional
Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
.. deprecated:: 1.4.0
Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
mangle_dupe_cols : bool, default True
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.
.. deprecated:: 1.5.0
Not implemented, and a new argument to specify the pattern for the
names of duplicated columns will be added instead
dtype : Type name or dict of column -> type, optional
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
'c': 'Int64'}
Use `str` or `object` together with suitable `na_values` settings
to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.
.. versionadded:: 1.5.0
Support for defaultdict was added. Specify a defaultdict as input where
the default determines the dtype of the columns which are not explicitly
listed.
engine : {'c', 'python', 'pyarrow'}, optional
Parser engine to use. The C and pyarrow engines are faster, while the python engine
is currently more feature-complete. Multithreading is currently only supported by
the pyarrow engine.
.. versionadded:: 1.4.0
The "pyarrow" engine was added as an *experimental* engine, and some features
are unsupported, or may not work correctly, with this engine.
converters : dict, optional
Dict of functions for converting values in certain columns. Keys can either
be integers or column labels.
true_values : list, optional
Values to consider as True.
false_values : list, optional
Values to consider as False.
skipinitialspace : bool, default False
Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
Line numbers to skip (0-indexed) or number of lines to skip (int)
at the start of the file.
If callable, the callable function will be evaluated against the row
indices, returning True if the row should be skipped and False otherwise.
An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values. By default the following values are interpreted as
NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
'1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
'nan', 'null'.
keep_default_na : bool, default True
Whether or not to include the default NaN values when parsing the data.
Depending on whether `na_values` is passed in, the behavior is as follows:
* If `keep_default_na` is True, and `na_values` are specified, `na_values`
is appended to the default NaN values used for parsing.
* If `keep_default_na` is True, and `na_values` are not specified, only
the default NaN values are used for parsing.
* If `keep_default_na` is False, and `na_values` are specified, only
the NaN values specified `na_values` are used for parsing.
* If `keep_default_na` is False, and `na_values` are not specified, no
strings will be parsed as NaN.
Note that if `na_filter` is passed in as False, the `keep_default_na` and
`na_values` parameters will be ignored.
na_filter : bool, default True
Detect missing value markers (empty strings and the value of na_values). In
data without any NAs, passing na_filter=False can improve the performance
of reading a large file.
verbose : bool, default False
Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, default False
The behavior is as follows:
* boolean. If True -> try parsing the index.
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
each as a separate date column.
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
a single date column.
* dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
result 'foo'
If a column or index cannot be represented as an array of datetimes,
say because of an unparsable value or a mixture of timezones, the column
or index will be returned unaltered as an object data type. For
non-standard datetime parsing, use ``pd.to_datetime`` after
``pd.read_csv``. To parse an index or column with a mixture of timezones,
specify ``date_parser`` to be a partially-applied
:func:`pandas.to_datetime` with ``utc=True``. See
:ref:`io.csv.mixed_timezones` for more.
Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
If True and `parse_dates` is enabled, pandas will attempt to infer the
format of the datetime strings in the columns, and if it can be inferred,
switch to a faster method of parsing them. In some cases this can increase
the parsing speed by 5-10x.
keep_date_col : bool, default False
If True and `parse_dates` specifies combining multiple columns then
keep the original columns.
date_parser : function, optional
Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses ``dateutil.parser.parser`` to do the
conversion. Pandas will try to call `date_parser` in three different ways,
advancing to the next if an exception occurs: 1) Pass one or more arrays
(as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
string values from the columns defined by `parse_dates` into a single array
and pass that; and 3) call `date_parser` once for each row using one or
more strings (corresponding to the columns defined by `parse_dates`) as
arguments.
dayfirst : bool, default False
DD/MM format dates, international and European format.
cache_dates : bool, default True
If True, use a cache of unique, converted dates to apply the datetime
conversion. May produce significant speed-up when parsing duplicate
date strings, especially ones with timezone offsets.
.. versionadded:: 0.25.0
iterator : bool, default False
Return TextFileReader object for iteration or getting chunks with
``get_chunk()``.
.. versionchanged:: 1.2
``TextFileReader`` is a context manager.
chunksize : int, optional
Return TextFileReader object for iteration.
See the `IO Tools docs
<https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
for more information on ``iterator`` and ``chunksize``.
.. versionchanged:: 1.2
``TextFileReader`` is a context manager.
compression : str or dict, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
path-like, then detect compression from the following extensions: '.gz',
'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
(otherwise no compression).
If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
Set to ``None`` for no decompression.
Can also be a dict with key ``'method'`` set
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
key-value pairs are forwarded to
``zipfile.ZipFile``, ``gzip.GzipFile``,
``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or
``tarfile.TarFile``, respectively.
As an example, the following could be passed for Zstandard decompression using a
custom compression dictionary:
``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
.. versionadded:: 1.5.0
Added support for `.tar` files.
.. versionchanged:: 1.4.0 Zstandard support.
thousands : str, optional
Thousands separator.
decimal : str, default '.'
Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
The character used to denote the start and end of a quoted item. Quoted
items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
whether or not to interpret two consecutive quotechar elements INSIDE a
field as a single ``quotechar`` element.
escapechar : str (length 1), optional
One-character string used to escape other characters.
comment : str, optional
Indicates remainder of line should not be parsed. If found at the beginning
of a line, the line will be ignored altogether. This parameter must be a
single character. Like empty lines (as long as ``skip_blank_lines=True``),
fully commented lines are ignored by the parameter `header` but not by
`skiprows`. For example, if ``comment='#'``, parsing
``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
treated as the header.
encoding : str, optional
Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
standard encodings
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
.. versionchanged:: 1.2
When ``encoding`` is ``None``, ``errors="replace"`` is passed to
``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
This behavior was previously only the case for ``engine="python"``.
.. versionchanged:: 1.3.0
``encoding_errors`` is a new argument. ``encoding`` has no longer an
influence on how encoding errors are handled.
encoding_errors : str, optional, default "strict"
How encoding errors are treated. `List of possible values
<https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
.. versionadded:: 1.3.0
dialect : str or csv.Dialect, optional
If provided, this parameter will override values (default or not) for the
following parameters: `delimiter`, `doublequote`, `escapechar`,
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
override values, a ParserWarning will be issued. See csv.Dialect
documentation for more details.
error_bad_lines : bool, optional, default ``None``
Lines with too many fields (e.g. a csv line with too many commas) will by
default cause an exception to be raised, and no DataFrame will be returned.
If False, then these "bad lines" will be dropped from the DataFrame that is
returned.
.. deprecated:: 1.3.0
The ``on_bad_lines`` parameter should be used instead to specify behavior upon
encountering a bad line instead.
warn_bad_lines : bool, optional, default ``None``
If error_bad_lines is False, and warn_bad_lines is True, a warning for each
"bad line" will be output.
.. deprecated:: 1.3.0
The ``on_bad_lines`` parameter should be used instead to specify behavior upon
encountering a bad line instead.
on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
Specifies what to do upon encountering a bad line (a line with too many fields).
Allowed values are :
- 'error', raise an Exception when a bad line is encountered.
- 'warn', raise a warning when a bad line is encountered and skip that line.
- 'skip', skip bad lines without raising or warning when they are encountered.
.. versionadded:: 1.3.0
.. versionadded:: 1.4.0
- callable, function with signature
``(bad_line: list[str]) -> list[str] | None`` that will process a single
bad line. ``bad_line`` is a list of strings split by the ``sep``.
If the function returns ``None``, the bad line will be ignored.
If the function returns a new list of strings with more elements than
expected, a ``ParserWarning`` will be emitted while dropping extra elements.
Only supported when ``engine="python"``
delim_whitespace : bool, default False
Specifies whether or not whitespace (e.g. ``' '`` or ``' '``) will be
used as the sep. Equivalent to setting ``sep='\s+'``. If this option
is set to True, nothing should be passed in for the ``delimiter``
parameter.
low_memory : bool, default True
Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
types either set False, or specify the type with the `dtype` parameter.
Note that the entire file is read into a single DataFrame regardless,
use the `chunksize` or `iterator` parameter to return the data in chunks.
(Only valid with C parser).
memory_map : bool, default False
If a filepath is provided for `filepath_or_buffer`, map the file object
directly onto memory and access the data directly from there. Using this
option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
Specifies which converter the C engine should use for floating-point
values. The options are ``None`` or 'high' for the ordinary converter,
'legacy' for the original lower precision pandas converter, and
'round_trip' for the round-trip converter.
.. versionchanged:: 1.2
storage_options : dict, optional
Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to ``urllib.request.Request`` as header options. For other
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
details, and for more examples on storage options refer `here
<https://pandas.pydata.org/docs/user_guide/io.html?
highlight=storage_options#reading-writing-remote-files>`_.
.. versionadded:: 1.2
Returns
-------
DataFrame or TextParser
A comma-separated values (csv) file is returned as two-dimensional
data structure with labeled axes.
See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.
Examples
--------
>>> pd.read_csv('data.csv') # doctest: +SKIP
!cat 2ddata.txt
11,12,13 21,22,23 31,32,33
pd.read_csv("2ddata.txt", header=None)
| 0 | 1 | 2 | |
|---|---|---|---|
| 0 | 11 | 12 | 13 |
| 1 | 21 | 22 | 23 |
| 2 | 31 | 32 | 33 |
pd.read_csv("2ddata.txt", header=None, names=["col1", "col2", "col3"])
| col1 | col2 | col3 | |
|---|---|---|---|
| 0 | 11 | 12 | 13 |
| 1 | 21 | 22 | 23 |
| 2 | 31 | 32 | 33 |
wallet
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... | ... |
| 95 | 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 5 columns
wallet.head() # method from dataframe
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
wallet.head(10)
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| 5 | 5 | 2021-07-28 19:53:28.377359 | Utility | Electricity | 219.941711 |
| 6 | 6 | 2021-04-16 11:53:28.377359 | Books | Amazon Kindle | 270.322595 |
| 7 | 7 | 2021-02-15 10:53:28.377359 | Food | Zomato | 457.183104 |
| 8 | 8 | 2021-08-10 19:53:28.377359 | Utility | Phone | 151.496373 |
| 9 | 9 | 2020-11-29 14:53:28.377359 | Travel | Auto | 443.618884 |
wallet.tail()
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 95 | 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
wallet.describe() # basic stats of numeric columns
| Unnamed: 0 | debit | |
|---|---|---|
| count | 100.000000 | 100.000000 |
| mean | 49.500000 | 310.595905 |
| std | 29.011492 | 121.178218 |
| min | 0.000000 | 100.455501 |
| 25% | 24.750000 | 216.429447 |
| 50% | 49.500000 | 318.110776 |
| 75% | 74.250000 | 424.109079 |
| max | 99.000000 | 499.858182 |
try this
https://www.moneycontrol.com/markets/indian-indices/moneycontrolurl = "https://www.moneycontrol.com/markets/indian-indices/"
pd.read_html(moneycontrolurl)
[ Stock Name Sector LTP Change %Chg
0 IndusInd Bank Banks - Private Sector 1072.1 -72.7 -6.35,
Stock Name Sector LTP Change %Chg
0 No Data No Data No Data No Data No Data,
Stock Name Sector LTP Change %Chg
0 Infosys Computers - Software 1449.50 -22.05 -1.50
1 M&M Auto - Cars & Jeeps 1211.15 -15.55 -1.27
2 Adani Ports Infrastructure - General 689.70 -8.10 -1.16
3 Tata Motors Auto - LCVs & HCVs 431.10 -4.75 -1.09
4 SBI Life Insura Life & Health Insurance 1085.05 -10.85 -0.99
5 Hero Motocorp Auto - 2 & 3 Wheelers 2416.60 -23.70 -0.97
6 BPCL Refineries 323.15 -2.90 -0.89
7 HCL Tech Computers - Software 1107.25 -9.05 -0.81
8 Coal India Mining & Minerals 222.60 -1.75 -0.78
9 Dr Reddys Labs Pharmaceuticals 4364.00 -31.55 -0.72
10 Maruti Suzuki Auto - Cars & Jeeps 8552.00 -49.55 -0.58
11 HDFC Life Life & Health Insurance 487.90 -2.35 -0.48
12 TATA Cons. Prod Plantations - Tea & Coffee 701.25 -3.15 -0.45
13 Wipro Computers - Software 387.60 -1.50 -0.39
14 Bajaj Finserv Finance - Investments 1324.35 -4.50 -0.34
15 Eicher Motors Auto - LCVs & HCVs 3106.55 -9.55 -0.31
16 SBI Banks - Public Sector 545.75 -1.60 -0.29
17 Nestle Food Processing 18432.10 -53.65 -0.29
18 TCS Computers - Software 3321.55 -9.45 -0.28
19 Divis Labs Pharmaceuticals 2780.00 -7.50 -0.27
20 Titan Company Miscellaneous 2369.30 -6.20 -0.26
21 ITC Diversified 387.00 -1.00 -0.26
22 Sun Pharma Pharmaceuticals 953.25 -2.30 -0.24
23 Tata Steel Iron & Steel 107.85 -0.25 -0.23
24 Axis Bank Banks - Private Sector 850.85 -1.05 -0.12
25 NTPC Power - Generation & Distribution 180.55 -0.20 -0.11
26 Bajaj Auto Auto - 2 & 3 Wheelers 3819.25 -2.65 -0.07,
Stock Name Sector LTP Change %Chg
0 Adani Enterpris Trading 1925.95 29.75 1.57
1 ICICI Bank Banks - Private Sector 852.40 9.75 1.16
2 Apollo Hospital Hospitals & Medical Services 4356.60 39.35 0.91
3 Grasim Diversified 1608.20 12.30 0.77
4 JSW Steel Steel - Large 683.80 4.40 0.65
5 ONGC Oil Drilling And Exploration 156.80 1.00 0.64
6 Power Grid Corp Power - Generation & Distribution 227.85 1.30 0.57
7 Cipla Pharmaceuticals 886.15 4.95 0.56
8 HDFC Bank Banks - Private Sector 1597.05 8.40 0.53
9 HUL Personal Care 2478.15 12.70 0.52
10 Britannia Food Processing 4331.30 20.30 0.47
11 Kotak Mahindra Banks - Private Sector 1707.00 7.70 0.45
12 Reliance Refineries 2331.65 8.95 0.39
13 Bajaj Finance Finance - NBFC 5897.20 23.05 0.39
14 HDFC Finance - Housing 2618.30 9.40 0.36
15 Hindalco Iron & Steel 406.90 1.25 0.31
16 UPL Chemicals 712.55 0.90 0.13
17 Larsen Infrastructure - General 2160.15 2.30 0.11
18 Bharti Airtel Telecommunications - Service 774.35 0.60 0.08
19 UltraTechCement Cement - Major 7226.60 0.20 0.00
20 Asian Paints Paints & Varnishes 2830.10 -0.10 0.00,
Stock Name Sector LTP Change %Chg
0 No Data No Data No Data No Data No Data,
Stock Name Sector LTP Change %Chg
0 Tech Mahindra Computers - Software 1141.05 79.8 7.52]
data = pd.read_html(moneycontrolurl)
type(data)
list
len(data)
6
data[0]
| Stock Name | Sector | LTP | Change | %Chg | |
|---|---|---|---|---|---|
| 0 | IndusInd Bank | Banks - Private Sector | 1072.2 | -72.6 | -6.34 |
data[1]
| Stock Name | Sector | LTP | Change | %Chg | |
|---|---|---|---|---|---|
| 0 | No Data | No Data | No Data | No Data | No Data |
data[2]
| Stock Name | Sector | LTP | Change | %Chg | |
|---|---|---|---|---|---|
| 0 | Infosys | Computers - Software | 1448.65 | -22.90 | -1.56 |
| 1 | M&M | Auto - Cars & Jeeps | 1210.00 | -16.70 | -1.36 |
| 2 | Adani Ports | Infrastructure - General | 689.00 | -8.80 | -1.26 |
| 3 | Tata Motors | Auto - LCVs & HCVs | 431.15 | -4.70 | -1.08 |
| 4 | Hero Motocorp | Auto - 2 & 3 Wheelers | 2415.15 | -25.15 | -1.03 |
| 5 | SBI Life Insura | Life & Health Insurance | 1085.20 | -10.70 | -0.98 |
| 6 | BPCL | Refineries | 323.00 | -3.05 | -0.94 |
| 7 | HCL Tech | Computers - Software | 1106.90 | -9.40 | -0.84 |
| 8 | Coal India | Mining & Minerals | 222.55 | -1.80 | -0.80 |
| 9 | Dr Reddys Labs | Pharmaceuticals | 4361.00 | -34.55 | -0.79 |
| 10 | Maruti Suzuki | Auto - Cars & Jeeps | 8550.00 | -51.55 | -0.60 |
| 11 | TATA Cons. Prod | Plantations - Tea & Coffee | 700.55 | -3.85 | -0.55 |
| 12 | HDFC Life | Life & Health Insurance | 487.75 | -2.50 | -0.51 |
| 13 | Wipro | Computers - Software | 387.25 | -1.85 | -0.48 |
| 14 | Eicher Motors | Auto - LCVs & HCVs | 3103.30 | -12.80 | -0.41 |
| 15 | SBI | Banks - Public Sector | 545.40 | -1.95 | -0.36 |
| 16 | ITC | Diversified | 386.60 | -1.40 | -0.36 |
| 17 | Titan Company | Miscellaneous | 2367.95 | -7.55 | -0.32 |
| 18 | Bajaj Finserv | Finance - Investments | 1324.55 | -4.30 | -0.32 |
| 19 | Nestle | Food Processing | 18430.00 | -55.75 | -0.30 |
| 20 | Divis Labs | Pharmaceuticals | 2779.50 | -8.00 | -0.29 |
| 21 | Tata Steel | Iron & Steel | 107.80 | -0.30 | -0.28 |
| 22 | TCS | Computers - Software | 3322.80 | -8.20 | -0.25 |
| 23 | Sun Pharma | Pharmaceuticals | 953.30 | -2.25 | -0.24 |
| 24 | NTPC | Power - Generation & Distribution | 180.35 | -0.40 | -0.22 |
| 25 | Bajaj Auto | Auto - 2 & 3 Wheelers | 3816.20 | -5.70 | -0.15 |
| 26 | Axis Bank | Banks - Private Sector | 850.90 | -1.00 | -0.12 |
| 27 | UltraTechCement | Cement - Major | 7218.00 | -8.40 | -0.12 |
| 28 | Bharti Airtel | Telecommunications - Service | 773.25 | -0.50 | -0.06 |
| 29 | Larsen | Infrastructure - General | 2156.75 | -1.10 | -0.05 |
data[3]
| Stock Name | Sector | LTP | Change | %Chg | |
|---|---|---|---|---|---|
| 0 | Adani Enterpris | Trading | 1921.00 | 24.80 | 1.31 |
| 1 | ICICI Bank | Banks - Private Sector | 851.80 | 9.15 | 1.09 |
| 2 | Apollo Hospital | Hospitals & Medical Services | 4355.00 | 37.75 | 0.87 |
| 3 | Grasim | Diversified | 1606.50 | 10.60 | 0.66 |
| 4 | JSW Steel | Steel - Large | 683.45 | 4.05 | 0.60 |
| 5 | ONGC | Oil Drilling And Exploration | 156.70 | 0.90 | 0.58 |
| 6 | Cipla | Pharmaceuticals | 886.20 | 5.00 | 0.57 |
| 7 | Power Grid Corp | Power - Generation & Distribution | 227.80 | 1.25 | 0.55 |
| 8 | Kotak Mahindra | Banks - Private Sector | 1707.00 | 7.70 | 0.45 |
| 9 | Britannia | Food Processing | 4330.00 | 19.00 | 0.44 |
| 10 | HDFC Bank | Banks - Private Sector | 1595.20 | 6.55 | 0.41 |
| 11 | HUL | Personal Care | 2475.30 | 9.85 | 0.40 |
| 12 | Reliance | Refineries | 2331.05 | 8.35 | 0.36 |
| 13 | Bajaj Finance | Finance - NBFC | 5893.65 | 19.50 | 0.33 |
| 14 | HDFC | Finance - Housing | 2615.30 | 6.40 | 0.25 |
| 15 | Hindalco | Iron & Steel | 406.55 | 0.90 | 0.22 |
| 16 | UPL | Chemicals | 712.30 | 0.65 | 0.09 |
| 17 | Asian Paints | Paints & Varnishes | 2830.15 | -0.05 | 0.00 |
DataFrame has columns, and every column is a series. And every series has row and labels
wallet.columns # this has names of columns
Index(['Unnamed: 0', 'date', 'category', 'description', 'debit'], dtype='object')
!head wallet.csv
,date,category,description,debit 0,2021-03-07 14:53:28.377359,Music,Amazon,421.2073272347991 1,2020-10-08 09:53:28.377359,Food,Swiggy,328.4400802428426 2,2021-02-23 09:53:28.377359,Books,Amazon,244.67943701511354 3,2020-11-01 14:53:28.377359,Utility,Phone,222.7563175805277 4,2021-06-05 13:53:28.377359,Books,Flipcart,494.1284923793595 5,2021-07-28 19:53:28.377359,Utility,Electricity,219.94171130968408 6,2021-04-16 11:53:28.377359,Books,Amazon Kindle,270.32259514795845 7,2021-02-15 10:53:28.377359,Food,Zomato,457.1831036346536 8,2021-08-10 19:53:28.377359,Utility,Phone,151.49637259947792
wallet.debit # access column like an attribute inside the class
0 421.207327
1 328.440080
2 244.679437
3 222.756318
4 494.128492
...
95 388.671213
96 467.554562
97 320.789434
98 442.096469
99 100.455501
Name: debit, Length: 100, dtype: float64
type(wallet.debit)
pandas.core.series.Series
wallet['debit'] # can access columns in dictionary fashion
0 421.207327
1 328.440080
2 244.679437
3 222.756318
4 494.128492
...
95 388.671213
96 467.554562
97 320.789434
98 442.096469
99 100.455501
Name: debit, Length: 100, dtype: float64
wallet.columns #just names of columns in this dataframe.. it like a list
Index(['Unnamed: 0', 'date', 'category', 'description', 'debit'], dtype='object')
wallet.columns[0]
'Unnamed: 0'
Variable names in python can not have space in it
wallet.Unnamed: 0 # this is not possible because of space (special char)
wallet['Unnamed: 0']
0 0
1 1
2 2
3 3
4 4
..
95 95
96 96
97 97
98 98
99 99
Name: Unnamed: 0, Length: 100, dtype: int64
wallet.debit
0 421.207327
1 328.440080
2 244.679437
3 222.756318
4 494.128492
...
95 388.671213
96 467.554562
97 320.789434
98 442.096469
99 100.455501
Name: debit, Length: 100, dtype: float64
wallet.debit.sum()
31059.590543177277
wallet.debit.mean()
310.5959054317728
wallet.debit.describe()
count 100.000000 mean 310.595905 std 121.178218 min 100.455501 25% 216.429447 50% 318.110776 75% 424.109079 max 499.858182 Name: debit, dtype: float64
%%file missing.csv
A,B,C
11,,13
21,22,23
31,32,33
Overwriting missing.csv
missing_data = pd.read_csv("missing.csv")
missing_data
| A | B | C | |
|---|---|---|---|
| 0 | 11 | NaN | 13 |
| 1 | 21 | 22.0 | 23 |
| 2 | 31 | 32.0 | 33 |
missing_data.B
0 NaN 1 22.0 2 32.0 Name: B, dtype: float64
missing_data.B.sum() # this will internally ignore rows with NaN (or missing values)
54.0
missing_data.describe()
| A | B | C | |
|---|---|---|---|
| count | 3.0 | 2.000000 | 3.0 |
| mean | 21.0 | 27.000000 | 23.0 |
| std | 10.0 | 7.071068 | 10.0 |
| min | 11.0 | 22.000000 | 13.0 |
| 25% | 16.0 | 24.500000 | 18.0 |
| 50% | 21.0 | 27.000000 | 23.0 |
| 75% | 26.0 | 29.500000 | 28.0 |
| max | 31.0 | 32.000000 | 33.0 |
If we give only list of data to create series, then it is alomost like a list but with additional feature of series
s1 = pd.Series([420,320,120,220])
s1
0 420 1 320 2 120 3 220 dtype: int64
s1[0]
420
s1[3]
220
[i*i for i in s1]
[176400, 102400, 14400, 48400]
s1.sum()
1080
s1.head()
0 420 1 320 2 120 3 220 dtype: int64
s1.tail()
0 420 1 320 2 120 3 220 dtype: int64
stocks = pd.Series([420,320,120,220],
index=["APPLE","AT&T","IBM","NIKE"])
stocks # this series has labels to rows
APPLE 420 AT&T 320 IBM 120 NIKE 220 dtype: int64
stocks['APPLE']
420
stocks[0]
420
[s for s in stocks]
[420, 320, 120, 220]
missing_data
| A | B | C | |
|---|---|---|---|
| 0 | 11 | NaN | 13 |
| 1 | 21 | 22.0 | 23 |
| 2 | 31 | 32.0 | 33 |
missing_data.A
0 11 1 21 2 31 Name: A, dtype: int64
missing_data.B
0 NaN 1 22.0 2 32.0 Name: B, dtype: float64
missing_data.C
0 13 1 23 2 33 Name: C, dtype: int64
stocksdf = pd.DataFrame({"value": [420.0, 320.5, 120.1, 220.0],
"high": [423, 322, 125, 227.0],
"low": [419, 318, 115, 217.4],
"volume": [100, 200, 50, 1000]},
index = ["APPLE","AT&T","IBM","NIKE"])
stocksdf
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 |
| IBM | 120.1 | 125.0 | 115.0 | 50 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 |
stocksdf.value
APPLE 420.0 AT&T 320.5 IBM 120.1 NIKE 220.0 Name: value, dtype: float64
stocksdf.high
APPLE 423.0 AT&T 322.0 IBM 125.0 NIKE 227.0 Name: high, dtype: float64
stocksdf.volume
APPLE 100 AT&T 200 IBM 50 NIKE 1000 Name: volume, dtype: int64
stocksdf['APPLE']
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) File ~/usr/local/default/lib/python3.10/site-packages/pandas/core/indexes/base.py:3803, in Index.get_loc(self, key, method, tolerance) 3802 try: -> 3803 return self._engine.get_loc(casted_key) 3804 except KeyError as err: File ~/usr/local/default/lib/python3.10/site-packages/pandas/_libs/index.pyx:138, in pandas._libs.index.IndexEngine.get_loc() File ~/usr/local/default/lib/python3.10/site-packages/pandas/_libs/index.pyx:165, in pandas._libs.index.IndexEngine.get_loc() File pandas/_libs/hashtable_class_helper.pxi:5745, in pandas._libs.hashtable.PyObjectHashTable.get_item() File pandas/_libs/hashtable_class_helper.pxi:5753, in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'APPLE' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) Cell In[92], line 1 ----> 1 stocksdf['APPLE'] File ~/usr/local/default/lib/python3.10/site-packages/pandas/core/frame.py:3805, in DataFrame.__getitem__(self, key) 3803 if self.columns.nlevels > 1: 3804 return self._getitem_multilevel(key) -> 3805 indexer = self.columns.get_loc(key) 3806 if is_integer(indexer): 3807 indexer = [indexer] File ~/usr/local/default/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key, method, tolerance) 3803 return self._engine.get_loc(casted_key) 3804 except KeyError as err: -> 3805 raise KeyError(key) from err 3806 except TypeError: 3807 # If we have a listlike key, _check_indexing_error will raise 3808 # InvalidIndexError. Otherwise we fall through and re-raise 3809 # the TypeError. 3810 self._check_indexing_error(key) KeyError: 'APPLE'
stocksdf.loc['APPLE'] # row with label 'APPLE'
value 420.0 high 423.0 low 419.0 volume 100.0 Name: APPLE, dtype: float64
stocksdf
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 |
| IBM | 120.1 | 125.0 | 115.0 | 50 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 |
stocksdf.iloc[0]
value 420.0 high 423.0 low 419.0 volume 100.0 Name: APPLE, dtype: float64
stocksdf.loc[['APPLE','IBM']]
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 |
| IBM | 120.1 | 125.0 | 115.0 | 50 |
wallet
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... | ... |
| 95 | 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 5 columns
wallet = pd.read_csv("wallet.csv", index_col=0) # you can specify index column, it can be a number or a column name
wallet
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... |
| 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 4 columns
stocksdf.to_csv("stocks.csv")
pd.read_csv("stocks.csv") # by defaulf label is not taken from file, it is take as row number
| Unnamed: 0 | value | high | low | volume | |
|---|---|---|---|---|---|
| 0 | APPLE | 420.0 | 423.0 | 419.0 | 100 |
| 1 | AT&T | 320.5 | 322.0 | 318.0 | 200 |
| 2 | IBM | 120.1 | 125.0 | 115.0 | 50 |
| 3 | NIKE | 220.0 | 227.0 | 217.4 | 1000 |
pd.read_csv("stocks.csv", index_col=0)
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 |
| IBM | 120.1 | 125.0 | 115.0 | 50 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 |
!cat stocks.csv
,value,high,low,volume APPLE,420.0,423.0,419.0,100 AT&T,320.5,322.0,318.0,200 IBM,120.1,125.0,115.0,50 NIKE,220.0,227.0,217.4,1000
%%file stocks_ticker.csv
value,ticker,high,low,volume
420.0,APPLE,423.0,419.0,100
320.5,AT&T,322.0,318.0,200
120.1,IBM,125.0,115.0,50
220.0,NIKE,227.0,217.4,1000
Writing stocks_ticker.csv
pd.read_csv("stocks_ticker.csv")
| value | ticker | high | low | volume | |
|---|---|---|---|---|---|
| 0 | 420.0 | APPLE | 423.0 | 419.0 | 100 |
| 1 | 320.5 | AT&T | 322.0 | 318.0 | 200 |
| 2 | 120.1 | IBM | 125.0 | 115.0 | 50 |
| 3 | 220.0 | NIKE | 227.0 | 217.4 | 1000 |
pd.read_csv("stocks_ticker.csv", index_col="ticker")
| value | high | low | volume | |
|---|---|---|---|---|
| ticker | ||||
| APPLE | 420.0 | 423.0 | 419.0 | 100 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 |
| IBM | 120.1 | 125.0 | 115.0 | 50 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 |
wallet
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... |
| 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 4 columns
wallet.debit > 300
s1
0 420 1 320 2 120 3 220 dtype: int64
s1 > 200 # this is like a vector operation
0 True 1 True 2 False 3 True dtype: bool
[1, 1, 1] + [1, 1, 1] # concatenates
[1, 1, 1, 1, 1, 1]
pd.Series([1, 1, 1]) + pd.Series([1, 1, 1]) # vector addition
0 2 1 2 2 2 dtype: int64
s1 + s1
0 840 1 640 2 240 3 440 dtype: int64
s1 - s1
0 0 1 0 2 0 3 0 dtype: int64
s1 * 100
0 42000 1 32000 2 12000 3 22000 dtype: int64
s1 > 300
0 True 1 True 2 False 3 False dtype: bool
wallet.debit > 300
0 True
1 True
2 False
3 False
4 True
...
95 True
96 True
97 True
98 True
99 False
Name: debit, Length: 100, dtype: bool
greaterthan300 = wallet.debit > 300
greaterthan300.sum() # True -> 1, False = 0
55
wallet.debit[greaterthan300] # this works a filter... it removes rows which has False and keeps rows with has True
0 421.207327 1 328.440080 4 494.128492 7 457.183104 9 443.618884 10 328.175421 11 434.495468 12 329.536003 14 485.297743 16 390.316876 17 316.878675 18 433.822404 20 365.921808 21 329.097372 22 489.143483 23 354.940241 24 498.100496 25 483.315864 28 358.459933 32 345.030436 33 449.248030 34 499.858182 35 441.602143 36 472.941439 38 494.124399 41 425.187191 43 486.033933 46 359.324972 47 310.408610 48 310.058410 50 463.001875 51 331.227023 52 403.610070 55 423.749708 56 319.342876 60 382.519510 62 497.770860 63 355.989050 67 324.786917 69 300.524620 74 303.055421 75 425.625291 77 482.152343 78 415.372894 79 321.763416 81 470.080995 82 411.142701 83 451.584407 87 378.820641 92 356.842638 94 431.185537 95 388.671213 96 467.554562 97 320.789434 98 442.096469 Name: debit, dtype: float64
s1
0 420 1 320 2 120 3 220 dtype: int64
s1[s1 > 200]
0 420 1 320 3 220 dtype: int64
s1[s1 <= 200]
2 120 dtype: int64
wallet[wallet.debit>300]
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| 7 | 2021-02-15 10:53:28.377359 | Food | Zomato | 457.183104 |
| 9 | 2020-11-29 14:53:28.377359 | Travel | Auto | 443.618884 |
| 10 | 2021-06-15 13:53:28.377359 | Travel | Metro | 328.175421 |
| 11 | 2021-07-24 13:53:28.377359 | Food | Zomato | 434.495468 |
| 12 | 2021-07-24 14:53:28.377359 | Music | Amazon | 329.536003 |
| 14 | 2021-06-09 13:53:28.377359 | Travel | Taxi | 485.297743 |
| 16 | 2021-03-05 19:53:28.377359 | Utility | Phone | 390.316876 |
| 17 | 2021-04-17 18:53:28.377359 | Utility | Electricity | 316.878675 |
| 18 | 2021-05-08 15:53:28.377359 | Travel | Auto | 433.822404 |
| 20 | 2020-10-12 18:53:28.377359 | Travel | Auto | 365.921808 |
| 21 | 2021-01-04 19:53:28.377359 | Travel | Metro | 329.097372 |
| 22 | 2021-06-24 15:53:28.377359 | Food | Zomato | 489.143483 |
| 23 | 2020-12-11 10:53:28.377359 | Music | Netflix | 354.940241 |
| 24 | 2021-05-31 11:53:28.377359 | Books | Amazon | 498.100496 |
| 25 | 2021-05-21 14:53:28.377359 | Food | Hotel | 483.315864 |
| 28 | 2020-12-14 15:53:28.377359 | Utility | Phone | 358.459933 |
| 32 | 2021-06-23 11:53:28.377359 | Food | Zomato | 345.030436 |
| 33 | 2021-05-14 18:53:28.377359 | Food | Hotel | 449.248030 |
| 34 | 2021-05-14 10:53:28.377359 | Utility | Phone | 499.858182 |
| 35 | 2021-02-18 18:53:28.377359 | Travel | Metro | 441.602143 |
| 36 | 2020-12-10 10:53:28.377359 | Travel | Auto | 472.941439 |
| 38 | 2021-08-15 10:53:28.377359 | Travel | Auto | 494.124399 |
| 41 | 2021-02-20 19:53:28.377359 | Utility | Phone | 425.187191 |
| 43 | 2020-09-21 12:53:28.377359 | Utility | Phone | 486.033933 |
| 46 | 2021-05-15 15:53:28.377359 | Utility | Electricity | 359.324972 |
| 47 | 2020-10-28 10:53:28.377359 | Books | Flipcart | 310.408610 |
| 48 | 2021-08-23 17:53:28.377359 | Utility | Electricity | 310.058410 |
| 50 | 2020-12-24 11:53:28.377359 | Food | Zomato | 463.001875 |
| 51 | 2020-12-22 17:53:28.377359 | Food | Zomato | 331.227023 |
| 52 | 2021-03-26 09:53:28.377359 | Travel | Taxi | 403.610070 |
| 55 | 2021-01-21 19:53:28.377359 | Books | Flipcart | 423.749708 |
| 56 | 2021-05-19 18:53:28.377359 | Utility | Phone | 319.342876 |
| 60 | 2021-01-17 11:53:28.377359 | Utility | Electricity | 382.519510 |
| 62 | 2021-01-27 19:53:28.377359 | Books | Amazon Kindle | 497.770860 |
| 63 | 2021-05-10 11:53:28.377359 | Travel | Taxi | 355.989050 |
| 67 | 2021-07-31 14:53:28.377359 | Music | Netflix | 324.786917 |
| 69 | 2020-10-10 15:53:28.377359 | Utility | Electricity | 300.524620 |
| 74 | 2021-03-20 11:53:28.377359 | Travel | Taxi | 303.055421 |
| 75 | 2021-03-03 12:53:28.377359 | Food | Hotel | 425.625291 |
| 77 | 2021-01-18 14:53:28.377359 | Books | Amazon Kindle | 482.152343 |
| 78 | 2020-09-09 16:53:28.377359 | Music | spotify | 415.372894 |
| 79 | 2021-08-17 09:53:28.377359 | Music | Netflix | 321.763416 |
| 81 | 2020-10-29 16:53:28.377359 | Food | Hotel | 470.080995 |
| 82 | 2020-09-22 09:53:28.377359 | Music | spotify | 411.142701 |
| 83 | 2021-03-18 09:53:28.377359 | Books | Flipcart | 451.584407 |
| 87 | 2021-05-19 15:53:28.377359 | Food | Zomato | 378.820641 |
| 92 | 2021-01-12 19:53:28.377359 | Travel | Taxi | 356.842638 |
| 94 | 2021-01-04 13:53:28.377359 | Utility | Phone | 431.185537 |
| 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
wallet[wallet.category=="Books"]
| date | category | description | debit | |
|---|---|---|---|---|
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| 6 | 2021-04-16 11:53:28.377359 | Books | Amazon Kindle | 270.322595 |
| 19 | 2021-05-16 10:53:28.377359 | Books | Flipcart | 109.325909 |
| 24 | 2021-05-31 11:53:28.377359 | Books | Amazon | 498.100496 |
| 26 | 2020-08-26 15:53:28.377359 | Books | Amazon Kindle | 138.806578 |
| 31 | 2020-09-25 11:53:28.377359 | Books | Flipcart | 246.503527 |
| 47 | 2020-10-28 10:53:28.377359 | Books | Flipcart | 310.408610 |
| 55 | 2021-01-21 19:53:28.377359 | Books | Flipcart | 423.749708 |
| 62 | 2021-01-27 19:53:28.377359 | Books | Amazon Kindle | 497.770860 |
| 72 | 2021-06-30 18:53:28.377359 | Books | Amazon | 294.662869 |
| 77 | 2021-01-18 14:53:28.377359 | Books | Amazon Kindle | 482.152343 |
| 83 | 2021-03-18 09:53:28.377359 | Books | Flipcart | 451.584407 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
problem
compute_expenses to compute expenses for given category.>>> wallet = pd.read_csv("wallet.csv", index_col=0)
>>> compute_expenses(waller, "Books")
??
def compute_expenses(dataframe, category):
return dataframe[dataframe.category == category]['debit'].sum()
compute_expenses(wallet, "Books")
4929.750393283798
compute_expenses(wallet, "Food")
8281.189172581233
compute_expenses(wallet, "Travel")
6052.931876440963
wallet = pd.read_csv("wallet.csv")
wallet
| Unnamed: 0 | date | category | description | debit | |
|---|---|---|---|---|---|
| 0 | 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... | ... |
| 95 | 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 5 columns
del wallet['Unnamed: 0']
wallet
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... |
| 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 4 columns
stocksdf
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 |
| IBM | 120.1 | 125.0 | 115.0 | 50 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 |
total_asset = stocksdf['value'] * stocksdf['volume']
total_asset
APPLE 42000.0 AT&T 64100.0 IBM 6005.0 NIKE 220000.0 dtype: float64
stocksdf['total_asset'] = total_asset
stocksdf
| value | high | low | volume | total_asset | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 | 42000.0 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 | 64100.0 |
| IBM | 120.1 | 125.0 | 115.0 | 50 | 6005.0 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 | 220000.0 |
s1 = pd.Series([2, 3, 4, 5] , index=["a","b","c","d"])
s1
a 2 b 3 c 4 d 5 dtype: int64
s2 = pd.Series([1, 2, 3, 4, 5], index=["e","d","c","b","a"])
s1
a 2 b 3 c 4 d 5 dtype: int64
s2
e 1 d 2 c 3 b 4 a 5 dtype: int64
s1 + s2
a 7.0 b 7.0 c 7.0 d 7.0 e NaN dtype: float64
stocksdf
| value | high | low | volume | total_asset | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 | 42000.0 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 | 64100.0 |
| IBM | 120.1 | 125.0 | 115.0 | 50 | 6005.0 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 | 220000.0 |
stocksdf.columns
Index(['value', 'high', 'low', 'volume', 'total_asset'], dtype='object')
reordered_cols = stocksdf[['value', 'total_asset', 'high', 'low', 'volume']]
reordered_cols
| value | total_asset | high | low | volume | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 42000.0 | 423.0 | 419.0 | 100 |
| AT&T | 320.5 | 64100.0 | 322.0 | 318.0 | 200 |
| IBM | 120.1 | 6005.0 | 125.0 | 115.0 | 50 |
| NIKE | 220.0 | 220000.0 | 227.0 | 217.4 | 1000 |
stocksdf.rename(columns= dict(zip(['value', 'high', 'low', 'volume', 'total_asset'],
['Value', 'High', 'Low', 'Volume', 'Total_Asset'])))
| Value | High | Low | Volume | Total_Asset | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 | 42000.0 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 | 64100.0 |
| IBM | 120.1 | 125.0 | 115.0 | 50 | 6005.0 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 | 220000.0 |
stocksdf
| value | high | low | volume | total_asset | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 | 42000.0 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 | 64100.0 |
| IBM | 120.1 | 125.0 | 115.0 | 50 | 6005.0 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 | 220000.0 |
df = stocksdf.rename(columns= dict(zip(['value', 'high', 'low', 'volume', 'total_asset'],
['Value', 'High', 'Low', 'Volume', 'Total_Asset'])))
df
| Value | High | Low | Volume | Total_Asset | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 | 42000.0 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 | 64100.0 |
| IBM | 120.1 | 125.0 | 115.0 | 50 | 6005.0 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 | 220000.0 |
df.rename(columns={"Value":"Hello"})
| Hello | High | Low | Volume | Total_Asset | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 | 42000.0 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 | 64100.0 |
| IBM | 120.1 | 125.0 | 115.0 | 50 | 6005.0 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 | 220000.0 |
food = wallet[wallet.category=="Food"]
wallet
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... |
| 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 4 columns
food
| date | category | description | debit | |
|---|---|---|---|---|
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 7 | 2021-02-15 10:53:28.377359 | Food | Zomato | 457.183104 |
| 11 | 2021-07-24 13:53:28.377359 | Food | Zomato | 434.495468 |
| 15 | 2021-08-24 17:53:28.377359 | Food | Zomato | 262.943993 |
| 22 | 2021-06-24 15:53:28.377359 | Food | Zomato | 489.143483 |
| 25 | 2021-05-21 14:53:28.377359 | Food | Hotel | 483.315864 |
| 30 | 2020-09-15 18:53:28.377359 | Food | Swiggy | 203.529240 |
| 32 | 2021-06-23 11:53:28.377359 | Food | Zomato | 345.030436 |
| 33 | 2021-05-14 18:53:28.377359 | Food | Hotel | 449.248030 |
| 39 | 2021-05-17 17:53:28.377359 | Food | Swiggy | 112.333160 |
| 40 | 2021-07-19 12:53:28.377359 | Food | Swiggy | 291.545988 |
| 42 | 2021-08-22 17:53:28.377359 | Food | Hotel | 210.256270 |
| 50 | 2020-12-24 11:53:28.377359 | Food | Zomato | 463.001875 |
| 51 | 2020-12-22 17:53:28.377359 | Food | Zomato | 331.227023 |
| 58 | 2021-05-20 10:53:28.377359 | Food | Hotel | 255.871035 |
| 59 | 2020-08-28 11:53:28.377359 | Food | Swiggy | 208.232912 |
| 61 | 2021-02-25 13:53:28.377359 | Food | Hotel | 124.658278 |
| 64 | 2021-01-31 14:53:28.377359 | Food | Zomato | 232.222380 |
| 66 | 2020-10-09 16:53:28.377359 | Food | Swiggy | 263.957770 |
| 71 | 2021-03-30 12:53:28.377359 | Food | Swiggy | 245.360508 |
| 75 | 2021-03-03 12:53:28.377359 | Food | Hotel | 425.625291 |
| 80 | 2021-02-17 09:53:28.377359 | Food | Swiggy | 283.095707 |
| 81 | 2020-10-29 16:53:28.377359 | Food | Hotel | 470.080995 |
| 86 | 2021-05-07 16:53:28.377359 | Food | Zomato | 198.450672 |
| 87 | 2021-05-19 15:53:28.377359 | Food | Zomato | 378.820641 |
| 91 | 2021-01-22 17:53:28.377359 | Food | Hotel | 232.663468 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
stocksdf[['value', 'total_asset', 'high', 'low', 'volume']]
| value | total_asset | high | low | volume | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 42000.0 | 423.0 | 419.0 | 100 |
| AT&T | 320.5 | 64100.0 | 322.0 | 318.0 | 200 |
| IBM | 120.1 | 6005.0 | 125.0 | 115.0 | 50 |
| NIKE | 220.0 | 220000.0 | 227.0 | 217.4 | 1000 |
stocksdf
| value | high | low | volume | total_asset | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 | 42000.0 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 | 64100.0 |
| IBM | 120.1 | 125.0 | 115.0 | 50 | 6005.0 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 | 220000.0 |
wallet = pd.read_csv("wallet.csv", index_col=0)
wallet
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... |
| 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 4 columns
wallet.category
0 Music
1 Food
2 Books
3 Utility
4 Books
...
95 Utility
96 Books
97 Utility
98 Travel
99 Food
Name: category, Length: 100, dtype: object
wallet.describe()
| debit | |
|---|---|
| count | 100.000000 |
| mean | 310.595905 |
| std | 121.178218 |
| min | 100.455501 |
| 25% | 216.429447 |
| 50% | 318.110776 |
| 75% | 424.109079 |
| max | 499.858182 |
wallet.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 100 entries, 0 to 99 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 100 non-null object 1 category 100 non-null object 2 description 100 non-null object 3 debit 100 non-null float64 dtypes: float64(1), object(3) memory usage: 3.9+ KB
wallet.category.str.upper() # this will apply this string method to every row from this column
0 MUSIC
1 FOOD
2 BOOKS
3 UTILITY
4 BOOKS
...
95 UTILITY
96 BOOKS
97 UTILITY
98 TRAVEL
99 FOOD
Name: category, Length: 100, dtype: object
wallet.category.str.lower()
0 music
1 food
2 books
3 utility
4 books
...
95 utility
96 books
97 utility
98 travel
99 food
Name: category, Length: 100, dtype: object
wallet.date
0 2021-03-07 14:53:28.377359
1 2020-10-08 09:53:28.377359
2 2021-02-23 09:53:28.377359
3 2020-11-01 14:53:28.377359
4 2021-06-05 13:53:28.377359
...
95 2021-07-19 13:53:28.377359
96 2021-01-12 19:53:28.377359
97 2021-03-25 11:53:28.377359
98 2021-05-13 15:53:28.377359
99 2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: object
wallet.debit < 300
0 False
1 False
2 True
3 True
4 False
...
95 False
96 False
97 False
98 False
99 True
Name: debit, Length: 100, dtype: bool
pd.to_datetime(wallet.date)
0 2021-03-07 14:53:28.377359
1 2020-10-08 09:53:28.377359
2 2021-02-23 09:53:28.377359
3 2020-11-01 14:53:28.377359
4 2021-06-05 13:53:28.377359
...
95 2021-07-19 13:53:28.377359
96 2021-01-12 19:53:28.377359
97 2021-03-25 11:53:28.377359
98 2021-05-13 15:53:28.377359
99 2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: datetime64[ns]
wallet.date
0 2021-03-07 14:53:28.377359
1 2020-10-08 09:53:28.377359
2 2021-02-23 09:53:28.377359
3 2020-11-01 14:53:28.377359
4 2021-06-05 13:53:28.377359
...
95 2021-07-19 13:53:28.377359
96 2021-01-12 19:53:28.377359
97 2021-03-25 11:53:28.377359
98 2021-05-13 15:53:28.377359
99 2020-10-11 16:53:28.377359
Name: date, Length: 100, dtype: object
import datetime
pd.to_datetime(wallet.date) > datetime.datetime(2020, 12, 31)
0 True
1 False
2 True
3 False
4 True
...
95 True
96 True
97 True
98 True
99 False
Name: date, Length: 100, dtype: bool
wallet['date'] = pd.to_datetime(wallet.date) # this has modified original dataframe
wallet.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 100 entries, 0 to 99 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 100 non-null datetime64[ns] 1 category 100 non-null object 2 description 100 non-null object 3 debit 100 non-null float64 dtypes: datetime64[ns](1), float64(1), object(2) memory usage: 3.9+ KB
wallet[wallet.date < datetime.datetime(2020, 12, 31)]
| date | category | description | debit | |
|---|---|---|---|---|
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 9 | 2020-11-29 14:53:28.377359 | Travel | Auto | 443.618884 |
| 20 | 2020-10-12 18:53:28.377359 | Travel | Auto | 365.921808 |
| 23 | 2020-12-11 10:53:28.377359 | Music | Netflix | 354.940241 |
| 26 | 2020-08-26 15:53:28.377359 | Books | Amazon Kindle | 138.806578 |
| 28 | 2020-12-14 15:53:28.377359 | Utility | Phone | 358.459933 |
| 30 | 2020-09-15 18:53:28.377359 | Food | Swiggy | 203.529240 |
| 31 | 2020-09-25 11:53:28.377359 | Books | Flipcart | 246.503527 |
| 36 | 2020-12-10 10:53:28.377359 | Travel | Auto | 472.941439 |
| 43 | 2020-09-21 12:53:28.377359 | Utility | Phone | 486.033933 |
| 44 | 2020-12-26 19:53:28.377359 | Utility | Electricity | 257.927593 |
| 47 | 2020-10-28 10:53:28.377359 | Books | Flipcart | 310.408610 |
| 50 | 2020-12-24 11:53:28.377359 | Food | Zomato | 463.001875 |
| 51 | 2020-12-22 17:53:28.377359 | Food | Zomato | 331.227023 |
| 54 | 2020-11-16 10:53:28.377359 | Music | spotify | 160.817543 |
| 59 | 2020-08-28 11:53:28.377359 | Food | Swiggy | 208.232912 |
| 65 | 2020-10-23 18:53:28.377359 | Music | Netflix | 188.748743 |
| 66 | 2020-10-09 16:53:28.377359 | Food | Swiggy | 263.957770 |
| 68 | 2020-08-26 09:53:28.377359 | Travel | Taxi | 279.147884 |
| 69 | 2020-10-10 15:53:28.377359 | Utility | Electricity | 300.524620 |
| 76 | 2020-11-17 09:53:28.377359 | Music | Netflix | 197.534600 |
| 78 | 2020-09-09 16:53:28.377359 | Music | spotify | 415.372894 |
| 81 | 2020-10-29 16:53:28.377359 | Food | Hotel | 470.080995 |
| 82 | 2020-09-22 09:53:28.377359 | Music | spotify | 411.142701 |
| 84 | 2020-09-21 10:53:28.377359 | Music | Netflix | 158.793646 |
| 90 | 2020-12-01 14:53:28.377359 | Music | Amazon | 101.573276 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
!cat stocks1.csv
value,ticker,high,low,volume 420.0,APPLE,423.0,419.0,100 320.5,AT&T,322.0,318.0,200 120.1,IBM,125.0,115.0,50 220.0,NIKE,227.0,217.4,1000
%%file stocks2.csv
value,ticker,high,low,volume
$420.0,APPLE,423.0,419.0,100
$320.5,AT&T,322.0,318.0,200
$120.1,IBM,125.0,115.0,50
$220.0,NIKE,227.0,217.4,1000
Writing stocks2.csv
stocks2 = pd.read_csv("stocks2.csv")
stocks2
| value | ticker | high | low | volume | |
|---|---|---|---|---|---|
| 0 | $420.0 | APPLE | 423.0 | 419.0 | 100 |
| 1 | $320.5 | AT&T | 322.0 | 318.0 | 200 |
| 2 | $120.1 | IBM | 125.0 | 115.0 | 50 |
| 3 | $220.0 | NIKE | 227.0 | 217.4 | 1000 |
stocks2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4 entries, 0 to 3 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 value 4 non-null object 1 ticker 4 non-null object 2 high 4 non-null float64 3 low 4 non-null float64 4 volume 4 non-null int64 dtypes: float64(2), int64(1), object(2) memory usage: 288.0+ bytes
stocks2.value* stocks2.volume
0 $420.0$420.0$420.0$420.0$420.0$420.0$420.0$420... 1 $320.5$320.5$320.5$320.5$320.5$320.5$320.5$320... 2 $120.1$120.1$120.1$120.1$120.1$120.1$120.1$120... 3 $220.0$220.0$220.0$220.0$220.0$220.0$220.0$220... dtype: object
"dfdsf" * 5
'dfdsfdfdsfdfdsfdfdsfdfdsf'
stocks2.value.str.replace("$","")
/tmp/ipykernel_37473/826872926.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
stocks2.value.str.replace("$","")
0 420.0 1 320.5 2 120.1 3 220.0 Name: value, dtype: object
stocks2['value'] = pd.to_numeric(stocks2.value.str.replace("$",""))
/tmp/ipykernel_37473/1359183244.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
stocks2['value'] = pd.to_numeric(stocks2.value.str.replace("$",""))
stocks2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4 entries, 0 to 3 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 value 4 non-null float64 1 ticker 4 non-null object 2 high 4 non-null float64 3 low 4 non-null float64 4 volume 4 non-null int64 dtypes: float64(3), int64(1), object(1) memory usage: 288.0+ bytes
stocks2.value* stocks2.volume
0 42000.0 1 64100.0 2 6005.0 3 220000.0 dtype: float64
%%file df.csv
A,B,C,D
a1,34.0,56,(343)
a2,32.0,78,(544)
a3,43.0,79,(545)
Writing df.csv
df = pd.read_csv("df.csv")
df
| A | B | C | D | |
|---|---|---|---|---|
| 0 | a1 | 34.0 | 56 | (343) |
| 1 | a2 | 32.0 | 78 | (544) |
| 2 | a3 | 43.0 | 79 | (545) |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3 entries, 0 to 2 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 A 3 non-null object 1 B 3 non-null float64 2 C 3 non-null int64 3 D 3 non-null object dtypes: float64(1), int64(1), object(2) memory usage: 224.0+ bytes
col = df.D.str.replace("(","-")
/tmp/ipykernel_37473/223839002.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
col = df.D.str.replace("(","-")
col
0 -343) 1 -544) 2 -545) Name: D, dtype: object
col.str.replace(")","")
/tmp/ipykernel_37473/3983318474.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
col.str.replace(")","")
0 -343 1 -544 2 -545 Name: D, dtype: object
df['D'] = pd.to_numeric(col.str.replace(")",""))
/tmp/ipykernel_37473/3102900507.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df['D'] = pd.to_numeric(col.str.replace(")",""))
df
| A | B | C | D | |
|---|---|---|---|---|
| 0 | a1 | 34.0 | 56 | -343 |
| 1 | a2 | 32.0 | 78 | -544 |
| 2 | a3 | 43.0 | 79 | -545 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3 entries, 0 to 2 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 A 3 non-null object 1 B 3 non-null float64 2 C 3 non-null int64 3 D 3 non-null int64 dtypes: float64(1), int64(2), object(1) memory usage: 224.0+ bytes
wallet = pd.read_csv("wallet.csv" , index_col=0)
wallet
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 1 | 2020-10-08 09:53:28.377359 | Food | Swiggy | 328.440080 |
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 3 | 2020-11-01 14:53:28.377359 | Utility | Phone | 222.756318 |
| 4 | 2021-06-05 13:53:28.377359 | Books | Flipcart | 494.128492 |
| ... | ... | ... | ... | ... |
| 95 | 2021-07-19 13:53:28.377359 | Utility | Phone | 388.671213 |
| 96 | 2021-01-12 19:53:28.377359 | Books | Flipcart | 467.554562 |
| 97 | 2021-03-25 11:53:28.377359 | Utility | Phone | 320.789434 |
| 98 | 2021-05-13 15:53:28.377359 | Travel | Taxi | 442.096469 |
| 99 | 2020-10-11 16:53:28.377359 | Food | Hotel | 100.455501 |
100 rows × 4 columns
wallet.groupby("description").sum()
/tmp/ipykernel_37473/4180037204.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
wallet.groupby("description").sum()
| debit | |
|---|---|
| description | |
| Amazon | 2504.690567 |
| Amazon Kindle | 1389.052376 |
| Auto | 2210.428935 |
| Electricity | 2885.064355 |
| Flipcart | 2503.255216 |
| Hotel | 2752.174732 |
| Metro | 1216.463665 |
| Netflix | 1546.567562 |
| Phone | 4677.202878 |
| Swiggy | 1936.495366 |
| Taxi | 2626.039276 |
| Zomato | 3592.519075 |
| spotify | 1219.636541 |
wallet.groupby("description")['debit'].max()
description Amazon 498.100496 Amazon Kindle 497.770860 Auto 494.124399 Electricity 382.519510 Flipcart 494.128492 Hotel 483.315864 Metro 441.602143 Netflix 354.940241 Phone 499.858182 Swiggy 328.440080 Taxi 485.297743 Zomato 489.143483 spotify 415.372894 Name: debit, dtype: float64
def find_max_in_shop(data, shopname):
maxvalues = data.groupby("description")['debit'].max()
return maxvalues.loc[shopname]
find_max_in_shop(wallet, "Flipcart")
494.1284923793595
max(["one", "two", "three"])
'two'
wallet[wallet.description=="Amazon"]
| date | category | description | debit | |
|---|---|---|---|---|
| 0 | 2021-03-07 14:53:28.377359 | Music | Amazon | 421.207327 |
| 2 | 2021-02-23 09:53:28.377359 | Books | Amazon | 244.679437 |
| 12 | 2021-07-24 14:53:28.377359 | Music | Amazon | 329.536003 |
| 24 | 2021-05-31 11:53:28.377359 | Books | Amazon | 498.100496 |
| 37 | 2021-04-18 16:53:28.377359 | Music | Amazon | 266.069078 |
| 72 | 2021-06-30 18:53:28.377359 | Books | Amazon | 294.662869 |
| 85 | 2021-01-12 09:53:28.377359 | Music | Amazon | 130.374908 |
| 89 | 2021-04-12 14:53:28.377359 | Music | Amazon | 218.487173 |
| 90 | 2020-12-01 14:53:28.377359 | Music | Amazon | 101.573276 |
Find total expense for Books done on Amazon
wallet.groupby(["category", "description"]).sum()
/tmp/ipykernel_37473/4010557707.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. wallet.groupby(["category", "description"]).sum()
| debit | ||
|---|---|---|
| category | description | |
| Books | Amazon | 1037.442802 |
| Amazon Kindle | 1389.052376 | |
| Flipcart | 2503.255216 | |
| Food | Hotel | 2752.174732 |
| Swiggy | 1936.495366 | |
| Zomato | 3592.519075 | |
| Music | Amazon | 1467.247766 |
| Netflix | 1546.567562 | |
| spotify | 1219.636541 | |
| Travel | Auto | 2210.428935 |
| Metro | 1216.463665 | |
| Taxi | 2626.039276 | |
| Utility | Electricity | 2885.064355 |
| Phone | 4677.202878 |
df = wallet.groupby(["category", "description"]).sum()
/tmp/ipykernel_37473/807199200.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. df = wallet.groupby(["category", "description"]).sum()
df
| debit | ||
|---|---|---|
| category | description | |
| Books | Amazon | 1037.442802 |
| Amazon Kindle | 1389.052376 | |
| Flipcart | 2503.255216 | |
| Food | Hotel | 2752.174732 |
| Swiggy | 1936.495366 | |
| Zomato | 3592.519075 | |
| Music | Amazon | 1467.247766 |
| Netflix | 1546.567562 | |
| spotify | 1219.636541 | |
| Travel | Auto | 2210.428935 |
| Metro | 1216.463665 | |
| Taxi | 2626.039276 | |
| Utility | Electricity | 2885.064355 |
| Phone | 4677.202878 |
df.loc[('Books','Amazon')]
debit 1037.442802 Name: (Books, Amazon), dtype: float64
s1
a 2 b 3 c 4 d 5 dtype: int64
s2
e 1 d 2 c 3 b 4 a 5 dtype: int64
s1 + s2
a 7.0 b 7.0 c 7.0 d 7.0 e NaN dtype: float64
[1, 1, 1] + [1, 1, 1]
[1, 1, 1, 1, 1, 1]
pd.concat([s1, s2])
a 2 b 3 c 4 d 5 e 1 d 2 c 3 b 4 a 5 dtype: int64
stocks
APPLE 420 AT&T 320 IBM 120 NIKE 220 dtype: int64
stocksdf
| value | high | low | volume | total_asset | |
|---|---|---|---|---|---|
| APPLE | 420.0 | 423.0 | 419.0 | 100 | 42000.0 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 | 64100.0 |
| IBM | 120.1 | 125.0 | 115.0 | 50 | 6005.0 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 | 220000.0 |
labels = ["APPLE","AT&T","IBM","NIKE"]
value = pd.Series([234.5, 221.6, 125.7, 100.5], index=labels)
high = pd.Series([240.32, 222.5, 127.3, 105.0], index=labels)
low = pd.Series([233.0, 220.0, 123.0, 104.0], index=labels)
volume = pd.Series([100, 200, 50, 1000], index=labels)
stock1 = pd.DataFrame({"value":value, "high":high, "low":low, "volume":volume})
stocks2 = pd.DataFrame({
"value" : [125, 500.0, 300.4, 423.9],
"low" : [125.0, 490.0, 299.5, 421.1],
"high" : [130.0, 500.0, 305.0, 425.5],
"volume" : [123, 50, 100, 80]
},
index = ["BELL","XEROX","FORD","TESLA"]
)
stock1
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 234.5 | 240.32 | 233.0 | 100 |
| AT&T | 221.6 | 222.50 | 220.0 | 200 |
| IBM | 125.7 | 127.30 | 123.0 | 50 |
| NIKE | 100.5 | 105.00 | 104.0 | 1000 |
stocks2
| value | low | high | volume | |
|---|---|---|---|---|
| BELL | 125.0 | 125.0 | 130.0 | 123 |
| XEROX | 500.0 | 490.0 | 500.0 | 50 |
| FORD | 300.4 | 299.5 | 305.0 | 100 |
| TESLA | 423.9 | 421.1 | 425.5 | 80 |
pd.concat([stock1, stocks2])
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 234.5 | 240.32 | 233.0 | 100 |
| AT&T | 221.6 | 222.50 | 220.0 | 200 |
| IBM | 125.7 | 127.30 | 123.0 | 50 |
| NIKE | 100.5 | 105.00 | 104.0 | 1000 |
| BELL | 125.0 | 130.00 | 125.0 | 123 |
| XEROX | 500.0 | 500.00 | 490.0 | 50 |
| FORD | 300.4 | 305.00 | 299.5 | 100 |
| TESLA | 423.9 | 425.50 | 421.1 | 80 |
labels = ["APPLE","AT&T","IBM","NIKE"]
value = pd.Series([234.5, 221.6, 125.7, 100.5], index=labels)
high = pd.Series([240.32, 222.5, 127.3, 105.0], index=labels)
low = pd.Series([233.0, 220.0, 123.0, 104.0], index=labels)
volume = pd.Series([100, 200, 50, 1000], index=labels)
stocks1 = pd.DataFrame({"value":value, "high":high, "low":low, "volume":volume})
stocks2 = pd.DataFrame({
"value" : [125, 500.0, 300.4, 423.9],
"volume" : [123, 50, 100, 80],
"low" : [125.0, 490.0, 299.5, 421.1],
"high" : [130.0, 500.0, 305.0, 425.5]
},
index = ["BELL","XEROX","FORD","TESLA"]
)
stocks1
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 234.5 | 240.32 | 233.0 | 100 |
| AT&T | 221.6 | 222.50 | 220.0 | 200 |
| IBM | 125.7 | 127.30 | 123.0 | 50 |
| NIKE | 100.5 | 105.00 | 104.0 | 1000 |
stocks2
| value | volume | low | high | |
|---|---|---|---|---|
| BELL | 125.0 | 123 | 125.0 | 130.0 |
| XEROX | 500.0 | 50 | 490.0 | 500.0 |
| FORD | 300.4 | 100 | 299.5 | 305.0 |
| TESLA | 423.9 | 80 | 421.1 | 425.5 |
pd.concat([stocks1, stocks2])
| value | high | low | volume | |
|---|---|---|---|---|
| APPLE | 234.5 | 240.32 | 233.0 | 100 |
| AT&T | 221.6 | 222.50 | 220.0 | 200 |
| IBM | 125.7 | 127.30 | 123.0 | 50 |
| NIKE | 100.5 | 105.00 | 104.0 | 1000 |
| BELL | 125.0 | 130.00 | 125.0 | 123 |
| XEROX | 500.0 | 500.00 | 490.0 | 50 |
| FORD | 300.4 | 305.00 | 299.5 | 100 |
| TESLA | 423.9 | 425.50 | 421.1 | 80 |
!cat stocks.csv
,value,high,low,volume APPLE,420.0,423.0,419.0,100 AT&T,320.5,322.0,318.0,200 IBM,120.1,125.0,115.0,50 NIKE,220.0,227.0,217.4,1000
%%file stocks_data1.csv
ticker,value,high
APPLE,420.0,423.0
AT&T,320.5,322.0
IBM,120.1,125.0
NIKE,220.0,227.0
Overwriting stocks_data1.csv
%%file stocks_data2.csv
ticker,low,volume
APPLE,419.0,100
AT&T,318.0,200
IBM,115.0,50
NIKE,217.4,1000
Overwriting stocks_data2.csv
data1 = pd.read_csv("stocks_data1.csv", index_col="ticker")
data2 = pd.read_csv("stocks_data2.csv", index_col="ticker")
data1
| value | high | |
|---|---|---|
| ticker | ||
| APPLE | 420.0 | 423.0 |
| AT&T | 320.5 | 322.0 |
| IBM | 120.1 | 125.0 |
| NIKE | 220.0 | 227.0 |
data2
| low | volume | |
|---|---|---|
| ticker | ||
| APPLE | 419.0 | 100 |
| AT&T | 318.0 | 200 |
| IBM | 115.0 | 50 |
| NIKE | 217.4 | 1000 |
data1.join(data2) # just like SQL ..this needs same index on both dataframes
| value | high | low | volume | |
|---|---|---|---|---|
| ticker | ||||
| APPLE | 420.0 | 423.0 | 419.0 | 100 |
| AT&T | 320.5 | 322.0 | 318.0 | 200 |
| IBM | 120.1 | 125.0 | 115.0 | 50 |
| NIKE | 220.0 | 227.0 | 217.4 | 1000 |
df1 = pd.read_csv("stocks_data1.csv")
df2 = pd.read_csv("stocks_data2.csv")
df1
| ticker | value | high | |
|---|---|---|---|
| 0 | APPLE | 420.0 | 423.0 |
| 1 | AT&T | 320.5 | 322.0 |
| 2 | IBM | 120.1 | 125.0 |
| 3 | NIKE | 220.0 | 227.0 |
df2
| ticker | low | volume | |
|---|---|---|---|
| 0 | APPLE | 419.0 | 100 |
| 1 | AT&T | 318.0 | 200 |
| 2 | IBM | 115.0 | 50 |
| 3 | NIKE | 217.4 | 1000 |
pd.merge(df1, df2, on="ticker")
| ticker | value | high | low | volume | |
|---|---|---|---|---|---|
| 0 | APPLE | 420.0 | 423.0 | 419.0 | 100 |
| 1 | AT&T | 320.5 | 322.0 | 318.0 | 200 |
| 2 | IBM | 120.1 | 125.0 | 115.0 | 50 |
| 3 | NIKE | 220.0 | 227.0 | 217.4 | 1000 |
wallet.to_csv("data.csv")