Module 3 - Day 4

Login to Lab using your credentials. There is a notebook with name 3-4.ipynb already created for you. Open that and use it for today’s training.

Shut down all previous notebooks.

You can access live notes from https://live.arcesium-lab.pipal.in

Reading PDF files from python

we will make use PyPDF2 library for reading pdf files. you can install PyPDF2 using

pip install PyPDF2

pdfurl = "https://posoco.in/download/16-07-20_nldc_psp/?wpdmdl=30215"

import requests

def download(url, filename):
    r = requests.get(url)
    if r.status_code == 200:
        with open(filename, "wb") as f:
            f.write(r.content)
    else:
        print(f"Error in downloading {filename}")

download(pdfurl, "powerdata.pdf")

import PyPDF2

with open("powerdata.pdf") as f:
    pdfreader = PyPDF2.PdfFileReader(f)

---------------------------------------------------------------------------
DeprecationError                          Traceback (most recent call last)
Cell In[4], line 2
      1 with open("powerdata.pdf") as f:
----> 2     pdfreader = PyPDF2.PdfFileReader(f)

File /opt/tljh/user/lib/python3.10/site-packages/PyPDF2/_reader.py:1974, in PdfFileReader.__init__(self, *args, **kwargs)
   1973 def __init__(self, *args: Any, **kwargs: Any) -> None:
-> 1974     deprecation_with_replacement("PdfFileReader", "PdfReader", "3.0.0")
   1975     if "strict" not in kwargs and len(args) < 2:
   1976         kwargs["strict"] = True  # maintain the default

File /opt/tljh/user/lib/python3.10/site-packages/PyPDF2/_utils.py:369, in deprecation_with_replacement(old_name, new_name, removed_in)
    363 def deprecation_with_replacement(
    364     old_name: str, new_name: str, removed_in: str = "3.0.0"
    365 ) -> None:
    366     """
    367     Raise an exception that a feature was already removed, but has a replacement.
    368     """
--> 369     deprecation(DEPR_MSG_HAPPENED.format(old_name, removed_in, new_name))

File /opt/tljh/user/lib/python3.10/site-packages/PyPDF2/_utils.py:351, in deprecation(msg)
    350 def deprecation(msg: str) -> None:
--> 351     raise DeprecationError(msg)

DeprecationError: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead.

with open("powerdata.pdf") as f:
    pdfreader = PyPDF2.PdfReader(f)

PdfReader stream/file object is not in binary mode. It may not be read correctly.

---------------------------------------------------------------------------
UnsupportedOperation                      Traceback (most recent call last)
Cell In[5], line 2
      1 with open("powerdata.pdf") as f:
----> 2     pdfreader = PyPDF2.PdfReader(f)

File /opt/tljh/user/lib/python3.10/site-packages/PyPDF2/_reader.py:319, in PdfReader.__init__(self, stream, strict, password)
    317     with open(stream, "rb") as fh:
    318         stream = BytesIO(fh.read())
--> 319 self.read(stream)
    320 self.stream = stream
    322 self._override_encryption = False

File /opt/tljh/user/lib/python3.10/site-packages/PyPDF2/_reader.py:1415, in PdfReader.read(self, stream)
   1413 def read(self, stream: StreamType) -> None:
   1414     self._basic_validation(stream)
-> 1415     self._find_eof_marker(stream)
   1416     startxref = self._find_startxref_pos(stream)
   1418     # check and eventually correct the startxref only in not strict

File /opt/tljh/user/lib/python3.10/site-packages/PyPDF2/_reader.py:1472, in PdfReader._find_eof_marker(self, stream)
   1470 if stream.tell() < last_mb:
   1471     raise PdfReadError("EOF marker not found")
-> 1472 line = read_previous_line(stream)

File /opt/tljh/user/lib/python3.10/site-packages/PyPDF2/_utils.py:207, in read_previous_line(stream)
    204     break
    205 # Read the block. After this, our stream will be one
    206 # beyond the initial position.
--> 207 block = read_block_backwards(stream, to_read)
    208 idx = len(block) - 1
    209 if not found_crlf:
    210     # We haven't found our first CR/LF yet.
    211     # Read off characters until we hit one.

File /opt/tljh/user/lib/python3.10/site-packages/PyPDF2/_utils.py:180, in read_block_backwards(stream, to_read)
    178     raise PdfStreamError("Could not read malformed PDF file")
    179 # Seek to the start of the block we want to read.
--> 180 stream.seek(-to_read, SEEK_CUR)
    181 read = stream.read(to_read)
    182 # Seek to the start of the block we read after reading it.

UnsupportedOperation: can't do nonzero cur-relative seeks

help(PyPDF2.PdfReader)

Help on class PdfReader in module PyPDF2._reader:

class PdfReader(builtins.object)
 |  PdfReader(stream: Union[str, IO, pathlib.Path], strict: bool = False, password: Union[NoneType, str, bytes] = None) -> None
 |  
 |  Initialize a PdfReader object.
 |  
 |  This operation can take some time, as the PDF stream's cross-reference
 |  tables are read into memory.
 |  
 |  :param stream: A File object or an object that supports the standard read
 |      and seek methods similar to a File object. Could also be a
 |      string representing a path to a PDF file.
 |  :param bool strict: Determines whether user should be warned of all
 |      problems and also causes some correctable problems to be fatal.
 |      Defaults to ``False``.
 |  :param None/str/bytes password: Decrypt PDF file at initialization. If the
 |      password is None, the file will not be decrypted.
 |      Defaults to ``None``
 |  
 |  Methods defined here:
 |  
 |  __init__(self, stream: Union[str, IO, pathlib.Path], strict: bool = False, password: Union[NoneType, str, bytes] = None) -> None
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  cacheGetIndirectObject(self, generation: int, idnum: int) -> Optional[PyPDF2.generic._base.PdfObject]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :meth:`cache_get_indirect_object` instead.
 |  
 |  cacheIndirectObject(self, generation: int, idnum: int, obj: Optional[PyPDF2.generic._base.PdfObject]) -> Optional[PyPDF2.generic._base.PdfObject]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :meth:`cache_indirect_object` instead.
 |  
 |  cache_get_indirect_object(self, generation: int, idnum: int) -> Optional[PyPDF2.generic._base.PdfObject]
 |  
 |  cache_indirect_object(self, generation: int, idnum: int, obj: Optional[PyPDF2.generic._base.PdfObject]) -> Optional[PyPDF2.generic._base.PdfObject]
 |  
 |  decode_permissions(self, permissions_code: int) -> Dict[str, bool]
 |  
 |  decrypt(self, password: Union[str, bytes]) -> PyPDF2._encryption.PasswordType
 |      When using an encrypted / secured PDF file with the PDF Standard
 |      encryption handler, this function will allow the file to be decrypted.
 |      It checks the given password against the document's user password and
 |      owner password, and then stores the resulting decryption key if either
 |      password is correct.
 |      
 |      It does not matter which password was matched.  Both passwords provide
 |      the correct decryption key that will allow the document to be used with
 |      this library.
 |      
 |      :param str password: The password to match.
 |      :return: `PasswordType`.
 |  
 |  getDestinationPageNumber(self, destination: PyPDF2.generic._data_structures.Destination) -> int
 |      .. deprecated:: 1.28.0
 |      
 |          Use :meth:`get_destination_page_number` instead.
 |  
 |  getDocumentInfo(self) -> Optional[PyPDF2._reader.DocumentInformation]
 |      .. deprecated:: 1.28.0
 |      
 |          Use the attribute :py:attr:`metadata` instead.
 |  
 |  getFields(self, tree: Optional[PyPDF2.generic._data_structures.TreeObject] = None, retval: Optional[Dict[Any, Any]] = None, fileobj: Optional[Any] = None) -> Optional[Dict[str, Any]]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :meth:`get_fields` instead.
 |  
 |  getFormTextFields(self) -> Dict[str, Any]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :meth:`get_form_text_fields` instead.
 |  
 |  getIsEncrypted(self) -> bool
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`is_encrypted` instead.
 |  
 |  getNamedDestinations(self, tree: Optional[PyPDF2.generic._data_structures.TreeObject] = None, retval: Optional[Any] = None) -> Dict[str, Any]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`named_destinations` instead.
 |  
 |  getNumPages(self) -> int
 |      .. deprecated:: 1.28.0
 |      
 |          Use :code:`len(reader.pages)` instead.
 |  
 |  getObject(self, indirectReference: PyPDF2.generic._base.IndirectObject) -> Optional[PyPDF2.generic._base.PdfObject]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :meth:`get_object` instead.
 |  
 |  getOutlines(self, node: Optional[PyPDF2.generic._data_structures.DictionaryObject] = None, outline: Optional[Any] = None) -> List[Union[PyPDF2.generic._data_structures.Destination, List[Union[PyPDF2.generic._data_structures.Destination, List[PyPDF2.generic._data_structures.Destination]]]]]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`outline` instead.
 |  
 |  getPage(self, pageNumber: int) -> PyPDF2._page.PageObject
 |      .. deprecated:: 1.28.0
 |      
 |          Use :code:`reader.pages[page_number]` instead.
 |  
 |  getPageLayout(self) -> Optional[str]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`page_layout` instead.
 |  
 |  getPageMode(self) -> Optional[Literal['/UseNone', '/UseOutlines', '/UseThumbs', '/FullScreen', '/UseOC', '/UseAttachments']]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`page_mode` instead.
 |  
 |  getPageNumber(self, page: PyPDF2._page.PageObject) -> int
 |      .. deprecated:: 1.28.0
 |      
 |          Use :meth:`get_page_number` instead.
 |  
 |  getXmpMetadata(self) -> Optional[PyPDF2.xmp.XmpInformation]
 |      .. deprecated:: 1.28.0
 |      
 |          Use the attribute :py:attr:`xmp_metadata` instead.
 |  
 |  get_destination_page_number(self, destination: PyPDF2.generic._data_structures.Destination) -> int
 |      Retrieve page number of a given Destination object.
 |      
 |      :param Destination destination: The destination to get page number.
 |      :return: the page number or -1 if page not found
 |  
 |  get_fields(self, tree: Optional[PyPDF2.generic._data_structures.TreeObject] = None, retval: Optional[Dict[Any, Any]] = None, fileobj: Optional[Any] = None) -> Optional[Dict[str, Any]]
 |      Extract field data if this PDF contains interactive form fields.
 |      
 |      The *tree* and *retval* parameters are for recursive use.
 |      
 |      :param fileobj: A file object (usually a text file) to write
 |          a report to on all interactive form fields found.
 |      :return: A dictionary where each key is a field name, and each
 |          value is a :class:`Field<PyPDF2.generic.Field>` object. By
 |          default, the mapping name is used for keys.
 |          ``None`` if form data could not be located.
 |  
 |  get_form_text_fields(self) -> Dict[str, Any]
 |      Retrieve form fields from the document with textual data.
 |      
 |      The key is the name of the form field, the value is the content of the
 |      field.
 |      
 |      If the document contains multiple form fields with the same name, the
 |      second and following will get the suffix _2, _3, ...
 |  
 |  get_object(self, indirect_reference: Union[int, PyPDF2.generic._base.IndirectObject]) -> Optional[PyPDF2.generic._base.PdfObject]
 |  
 |  get_page_number(self, page: PyPDF2._page.PageObject) -> int
 |      Retrieve page number of a given PageObject
 |      
 |      :param PageObject page: The page to get page number. Should be
 |          an instance of :class:`PageObject<PyPDF2._page.PageObject>`
 |      :return: the page number or -1 if page not found
 |  
 |  read(self, stream: <class 'IO'>) -> None
 |  
 |  readNextEndLine(self, stream: <class 'IO'>, limit_offset: int = 0) -> bytes
 |      .. deprecated:: 1.28.0
 |  
 |  readObjectHeader(self, stream: <class 'IO'>) -> Tuple[int, int]
 |      .. deprecated:: 1.28.0
 |      
 |          Use :meth:`read_object_header` instead.
 |  
 |  read_next_end_line(self, stream: <class 'IO'>, limit_offset: int = 0) -> bytes
 |      .. deprecated:: 2.1.0
 |  
 |  read_object_header(self, stream: <class 'IO'>) -> Tuple[int, int]
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties defined here:
 |  
 |  documentInfo
 |      .. deprecated:: 1.28.0
 |      
 |          Use the attribute :py:attr:`metadata` instead.
 |  
 |  isEncrypted
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`is_encrypted` instead.
 |  
 |  is_encrypted
 |      Read-only boolean property showing whether this PDF file is encrypted.
 |      Note that this property, if true, will remain true even after the
 |      :meth:`decrypt()<PyPDF2.PdfReader.decrypt>` method is called.
 |  
 |  metadata
 |      Retrieve the PDF file's document information dictionary, if it exists.
 |      Note that some PDF files use metadata streams instead of docinfo
 |      dictionaries, and these metadata streams will not be accessed by this
 |      function.
 |      
 |      :return: the document information of this PDF file
 |  
 |  namedDestinations
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`named_destinations` instead.
 |  
 |  named_destinations
 |      A read-only dictionary which maps names to
 |      :class:`Destinations<PyPDF2.generic.Destination>`
 |  
 |  numPages
 |      .. deprecated:: 1.28.0
 |      
 |          Use :code:`len(reader.pages)` instead.
 |  
 |  outline
 |      Read-only property for the outline (i.e., a collection of 'outline items'
 |      which are also known as 'bookmarks') present in the document.
 |      
 |      :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`.
 |  
 |  outlines
 |      .. deprecated:: 2.9.0
 |      
 |          Use :py:attr:`outline` instead.
 |  
 |  pageLayout
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`page_layout` instead.
 |  
 |  pageMode
 |      .. deprecated:: 1.28.0
 |      
 |          Use :py:attr:`page_mode` instead.
 |  
 |  page_layout
 |      Get the page layout.
 |      
 |      :return: Page layout currently being used.
 |      
 |      .. list-table:: Valid ``layout`` values
 |         :widths: 50 200
 |      
 |         * - /NoLayout
 |           - Layout explicitly not specified
 |         * - /SinglePage
 |           - Show one page at a time
 |         * - /OneColumn
 |           - Show one column at a time
 |         * - /TwoColumnLeft
 |           - Show pages in two columns, odd-numbered pages on the left
 |         * - /TwoColumnRight
 |           - Show pages in two columns, odd-numbered pages on the right
 |         * - /TwoPageLeft
 |           - Show two pages at a time, odd-numbered pages on the left
 |         * - /TwoPageRight
 |           - Show two pages at a time, odd-numbered pages on the right
 |  
 |  page_mode
 |      Get the page mode.
 |      
 |      :return: Page mode currently being used.
 |      
 |      .. list-table:: Valid ``mode`` values
 |         :widths: 50 200
 |      
 |         * - /UseNone
 |           - Do not show outline or thumbnails panels
 |         * - /UseOutlines
 |           - Show outline (aka bookmarks) panel
 |         * - /UseThumbs
 |           - Show page thumbnails panel
 |         * - /FullScreen
 |           - Fullscreen view
 |         * - /UseOC
 |           - Show Optional Content Group (OCG) panel
 |         * - /UseAttachments
 |           - Show attachments panel
 |  
 |  pages
 |      Read-only property that emulates a list of :py:class:`Page<PyPDF2._page.Page>` objects.
 |  
 |  pdf_header
 |  
 |  threads
 |      Read-only property for the list of threads see §8.3.2 from PDF 1.7 spec
 |      
 |      :return: an Array of Dictionnaries with "/F" and "/I" properties
 |               or None if no articles.
 |  
 |  xfa
 |  
 |  xmpMetadata
 |      .. deprecated:: 1.28.0
 |      
 |          Use the attribute :py:attr:`xmp_metadata` instead.
 |  
 |  xmp_metadata
 |      XMP (Extensible Metadata Platform) data
 |      
 |      :return: a :class:`XmpInformation<xmp.XmpInformation>`
 |          instance that can be used to access XMP metadata from the document.
 |          or ``None`` if no metadata was found on the document root.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)

PyPDF2.PdfReader("powerdata.pdf")

<PyPDF2._reader.PdfReader at 0x7f8d5d4a48b0>

reader = PyPDF2.PdfReader("powerdata.pdf")

reader.pages[0].extract_text()

' \nNational Load Despatch Centre  \nराष्ट्रीय भार प्रेषण केंद्र \nPOWER SYSTEM OPERATION CORPORATION LIMITED  \nपॉवर सिस्टम ऑपरेशन कारपोरेशन  सिसमटेड  \n (Government of India Enterprise / भारत िरकार का उद्यम ) \nB-9, QUTU B INSTITUTIONAL AREA, KATWARIA SARAI,  NEW DELHI -110016  \nबी-9, क़ुतुब इन्स्टीट्यूशनि एररया, कटवाररया िराये , न्यू सिल्ली -110016  \n_____________________________________________________________________________________________________________________________ __________  \nRef: POSOCO/NLDC/SO/Daily PSP  Report                     दिन ांक : 16th Jul 2020 \n \nTo, \n \n \n1. क र्यक री  दनिेशक, पू .क्षे .भ  .प्रे .के., 14, गोल्फ क्लब रोड , कोलक त  - 700033 \nExecutive Director , ERLDC, 14 Golf Club Road, Toll ygunge, Kolkata, 700033  \n2. क र्यक री  दनिेशक, ऊ. क्षे. भ . प्रे. के., 18/ ए , शहीि जीत दसांह सनसनव ल म गय , नई दिल्ली – 110016 \nExecutive Director , NRLDC, 18 -A, Shaheed Jeet Singh Marg, Katwaria Sarai, New Delhi – 110016  \n3. क र्यक री  दनिेशक, प .क्षे .भ  .प्रे .के., एफ-3, एम आई डी सी क्षेत्र , अांधेरी, म ांबई – 400093 \nExecutive Director , WRLDC, F -3, M.I.D.C. Area, Marol, Andheri  (East), Mumbai -400093  \n4. क र्यक री दनिेशक, ऊ. पू. क्षे. भ . प्रे. के., डोांगदतएह, लोअर नोांग्रह , ल पलांग, दशलोांग – 793006 \nExecutive Director , NERLDC, Dongteih, Lower Nongrah, Lapalang, Shillong - 793006, Meghalaya  \n5. क र्यक री  दनिेशक , ि .क्षे .भ  .प्रे .के., 29, रेस कोसय क्रॉस रोड, बांगल रु – 560009 \nExecutive Director , SRLDC, 29, Race Course Cross Road,  Bangalore -560009  \n \nSub: Daily PSP  Report  for the date 15.07.2020. \n \nमहोिर्/Dear Sir,  \n \nआई०ई०जी०सी०-2010 की ध र  स.-5.5.1 के प्र वध न के अन स र, दिन ांक 15-ज ल ई-2020 की अखिल \nभ रतीर् प्रण ली की िैदनक दग्रड दनष्प िन ररपोर्य र ०भ ०प्रे०के० की वेबस इर् पर उप्लब्ध है  |  \n \nAs per article 5.5.1 of the Indian Electricity Grid Code, the daily report pertaini ng power supply position of All  India \nPower System for the date 15th July 2020, is available at the NLDC website .  \n \n \n \n \n \n \nधन्यव ि, \n \n \n'

def extractPage(filename, pagenumber):
    pdfreader = PyPDF2.PdfReader(filename)
    return pdfreader.pages[pagenumber].extract_text()

extractPage("powerdata.pdf", 1)

'NR WR SR ER NER TOTAL\n59882 41115 34238 21526 2730 159491\n1114 0 0 0 6 1120\n1398 998 807 447 48 3698\n355 33 77 149 29 643\n11 49 128 - - 187\n39.60 16.60 41.59 4.60 0.03 102\n12.6 0.0 0.0 0.0 0.0 12.6\n65470 43593 38117 21535 2827 160654\n22:20 10:29 10:00 21:20 19:41 21:26\nRegion FVI < 49.7 49.7 - 49.8 49.8 - 49.9 < 49.9 49.9 - 50.05 > 50.05\nAll India 0.057 0.16 1.81 13.19 15.16 76.52 8.32\nMax.Demand Shortage during Energy Met Drawal OD(+)/UD(-) Max OD Energy\nRegion States Met during the \nday(MW)maximum \nDemand(MW)(MU)Schedule\n(MU)(MU) (MW)Shortage \n(MU)\nPunjab 11090 0 237.9 146.8 -1.8 49 0.0\nHaryana 9388 0 209.4 152.8 0.7 325 1.9\nRajasthan 12087 0 262.4 119.7 5.4 809 0.0\nDelhi 5726 0 118.6 102.8 -1.4 228 0.0\nNR UP 22873 0 448.9 208.5 2.0 546 0.4\nUttarakhand 1899 0 42.8 20.7 0.8 111 0.0\nHP 1366 0 28.6 -2.6 -0.2 91 0.0\nJ&K(UT) & Ladakh(UT) 2177 544 43.1 20.3 0.4 502 10.3\nChandigarh 295 0 6.0 5.9 0.2 61 0.0\nChhattisgarh 3685 0 86.9 36.8 0.8 468 0.0\nGujarat 13478 0 286.2 87.6 4.0 527 0.0\nMP 9547 0 214.7 113.8 -3.8 198 0.0\nWR Maharashtra 16964 0 365.1 138.1 -1.9 457 0.0\nGoa 405 0 8.5 8.2 -0.2 33 0.0\nDD 246 0 5.3 5.3 0.0 19 0.0\nDNH 614 0 14.0 13.8 0.2 44 0.0\nAMNSIL 777 0 17.1 4.2 0.7 272 0.0\nAndhra Pradesh 6439 0 141.0 45.6 -1.3 607 0.0\nTelangana 8614 0 167.3 81.6 -2.5 385 0.0\nSR Karnataka 8486 0 155.1 51.1 -3.4 650 0.0\nKerala 3077 0 65.2 46.1 0.5 179 0.0\nTamil Nadu 12371 0 271.3 125.9 -3.7 573 0.0\nPuducherry 349 0 7.5 7.5 -0.1 35 0.0\nBihar 5740 0 111.5 106.0 -0.3 386 0.0\nDVC 2989 0 62.7 -42.6 -0.7 206 0.0\nJharkhand 1438 0 26.3 18.5 -1.0 124 0.0\nER Odisha 3983 0 82.2 -0.2 -0.2 325 0.0\nWest Bengal 7917 0 162.6 47.2 -0.8 303 0.0\nSikkim 100 0 1.4 1.5 -0.1 17 0.0\nArunachal Pradesh 120 3 2.0 1.8 0.2 40 0.0\nAssam 1759 23 30.0 27.1 -0.1 135 0.0\nManipur 183 1 2.6 2.3 0.3 37 0.0\nNER Meghalaya 307 2 5.3 -1.3 0.3 52 0.0\nMizoram 89 1 1.5 1.2 0.0 13 0.0\nNagaland 140 2 2.2 2.3 -0.2 23 0.0\nTripura 298 7 4.9 5.9 0.7 66 0.0\nBhutan Nepal Bangladesh\n53.3 -1.5 -19.1\n2337.0 -271.3 -1110.0\nNR WR SR ER NER TOTAL\n352.1 -295.4 95.0 -145.8 -6.0 0.0\n359.2 -293.7 84.6 -152.6 -3.4 -6.0\n7.1 1.6 -10.5 -6.9 2.6 -6.0\nNR WR SR ER NER TOTAL\n3838 14847 11792 3445 677 34598\n9289 23225 14423 4892 47 51876\n13127 38072 26215 8337 723 86473\nNR WR SR ER NER All India\n546 1080 370 482 7 2486\n25 13 14 0 0 52\n355 33 77 149 29 643\n26 33 47 0 0 106\n40 82 19 0 22 163\n71 73 210 5 0 359\n1063 1314 737 636 58 3809\n6.71 5.54 28.51 0.73 0.05 9.43\n42.55 10.54 45.35 24.19 49.63 29.09\n1.068\n1.102 Based on State Max Demands\nDiversity factor = Sum of regional or state maximum demands / All India maximum demand\n*Source: RLDCs for solar connected to ISTS; SLDCs for embedded solar. Limited visibility of embedded solar data.\nExecutive Director-NLDCShare of RES in total generation (%)\nShare of Non-fossil fuel (Hydro,Nuclear and RES) in total generation(%)\nH. All India Demand Diversity Factor\nBased on Regional Max DemandsLignite\nHydro\nNuclear\nGas, Naptha & Diesel\nRES (Wind, Solar, Biomass & Others)\nTotalState Sector\nTotal\nG. Sourcewise generation (MU)\nCoalActual(MU)\nO/D/U/D(MU)\nF. Generation Outage(MW)\nCentral SectorDay Peak (MW)\nE. Import/Export by Regions (in MU) - Import(+ve)/Export(-ve); OD(+)/UD(-)\nSchedule(MU)D. Transnational Exchanges (MU) - Import(+ve)/Export(-ve)\xa0\xa0\xa0\nActual (MU)Energy Shortage (MU)\nMaximum Demand Met During the Day (MW) (From NLDC SCADA)\nTime Of Maximum Demand Met (From NLDC SCADA)\nB. Frequency Profile (%)\nC. Power Supply Position in StatesDemand Met during Evening Peak hrs(MW) (at 2000 hrs; from RLDCs)\nPeak Shortage (MW)\nEnergy Met (MU)\nHydro Gen (MU)\nWind Gen (MU)\nSolar Gen (MU)*Report for previous day Date of Reporting: 16-Jul-2020\nA. Power Supply Position at All India and Regional level\n'

print(extractPage("powerdata.pdf", 1))

NR WR SR ER NER TOTAL
59882 41115 34238 21526 2730 159491
1114 0 0 0 6 1120
1398 998 807 447 48 3698
355 33 77 149 29 643
11 49 128 - - 187
39.60 16.60 41.59 4.60 0.03 102
12.6 0.0 0.0 0.0 0.0 12.6
65470 43593 38117 21535 2827 160654
22:20 10:29 10:00 21:20 19:41 21:26
Region FVI < 49.7 49.7 - 49.8 49.8 - 49.9 < 49.9 49.9 - 50.05 > 50.05
All India 0.057 0.16 1.81 13.19 15.16 76.52 8.32
Max.Demand Shortage during Energy Met Drawal OD(+)/UD(-) Max OD Energy
Region States Met during the 
day(MW)maximum 
Demand(MW)(MU)Schedule
(MU)(MU) (MW)Shortage 
(MU)
Punjab 11090 0 237.9 146.8 -1.8 49 0.0
Haryana 9388 0 209.4 152.8 0.7 325 1.9
Rajasthan 12087 0 262.4 119.7 5.4 809 0.0
Delhi 5726 0 118.6 102.8 -1.4 228 0.0
NR UP 22873 0 448.9 208.5 2.0 546 0.4
Uttarakhand 1899 0 42.8 20.7 0.8 111 0.0
HP 1366 0 28.6 -2.6 -0.2 91 0.0
J&K(UT) & Ladakh(UT) 2177 544 43.1 20.3 0.4 502 10.3
Chandigarh 295 0 6.0 5.9 0.2 61 0.0
Chhattisgarh 3685 0 86.9 36.8 0.8 468 0.0
Gujarat 13478 0 286.2 87.6 4.0 527 0.0
MP 9547 0 214.7 113.8 -3.8 198 0.0
WR Maharashtra 16964 0 365.1 138.1 -1.9 457 0.0
Goa 405 0 8.5 8.2 -0.2 33 0.0
DD 246 0 5.3 5.3 0.0 19 0.0
DNH 614 0 14.0 13.8 0.2 44 0.0
AMNSIL 777 0 17.1 4.2 0.7 272 0.0
Andhra Pradesh 6439 0 141.0 45.6 -1.3 607 0.0
Telangana 8614 0 167.3 81.6 -2.5 385 0.0
SR Karnataka 8486 0 155.1 51.1 -3.4 650 0.0
Kerala 3077 0 65.2 46.1 0.5 179 0.0
Tamil Nadu 12371 0 271.3 125.9 -3.7 573 0.0
Puducherry 349 0 7.5 7.5 -0.1 35 0.0
Bihar 5740 0 111.5 106.0 -0.3 386 0.0
DVC 2989 0 62.7 -42.6 -0.7 206 0.0
Jharkhand 1438 0 26.3 18.5 -1.0 124 0.0
ER Odisha 3983 0 82.2 -0.2 -0.2 325 0.0
West Bengal 7917 0 162.6 47.2 -0.8 303 0.0
Sikkim 100 0 1.4 1.5 -0.1 17 0.0
Arunachal Pradesh 120 3 2.0 1.8 0.2 40 0.0
Assam 1759 23 30.0 27.1 -0.1 135 0.0
Manipur 183 1 2.6 2.3 0.3 37 0.0
NER Meghalaya 307 2 5.3 -1.3 0.3 52 0.0
Mizoram 89 1 1.5 1.2 0.0 13 0.0
Nagaland 140 2 2.2 2.3 -0.2 23 0.0
Tripura 298 7 4.9 5.9 0.7 66 0.0
Bhutan Nepal Bangladesh
53.3 -1.5 -19.1
2337.0 -271.3 -1110.0
NR WR SR ER NER TOTAL
352.1 -295.4 95.0 -145.8 -6.0 0.0
359.2 -293.7 84.6 -152.6 -3.4 -6.0
7.1 1.6 -10.5 -6.9 2.6 -6.0
NR WR SR ER NER TOTAL
3838 14847 11792 3445 677 34598
9289 23225 14423 4892 47 51876
13127 38072 26215 8337 723 86473
NR WR SR ER NER All India
546 1080 370 482 7 2486
25 13 14 0 0 52
355 33 77 149 29 643
26 33 47 0 0 106
40 82 19 0 22 163
71 73 210 5 0 359
1063 1314 737 636 58 3809
6.71 5.54 28.51 0.73 0.05 9.43
42.55 10.54 45.35 24.19 49.63 29.09
1.068
1.102 Based on State Max Demands
Diversity factor = Sum of regional or state maximum demands / All India maximum demand
*Source: RLDCs for solar connected to ISTS; SLDCs for embedded solar. Limited visibility of embedded solar data.
Executive Director-NLDCShare of RES in total generation (%)
Share of Non-fossil fuel (Hydro,Nuclear and RES) in total generation(%)
H. All India Demand Diversity Factor
Based on Regional Max DemandsLignite
Hydro
Nuclear
Gas, Naptha & Diesel
RES (Wind, Solar, Biomass & Others)
TotalState Sector
Total
G. Sourcewise generation (MU)
CoalActual(MU)
O/D/U/D(MU)
F. Generation Outage(MW)
Central SectorDay Peak (MW)
E. Import/Export by Regions (in MU) - Import(+ve)/Export(-ve); OD(+)/UD(-)
Schedule(MU)D. Transnational Exchanges (MU) - Import(+ve)/Export(-ve)   
Actual (MU)Energy Shortage (MU)
Maximum Demand Met During the Day (MW) (From NLDC SCADA)
Time Of Maximum Demand Met (From NLDC SCADA)
B. Frequency Profile (%)
C. Power Supply Position in StatesDemand Met during Evening Peak hrs(MW) (at 2000 hrs; from RLDCs)
Peak Shortage (MW)
Energy Met (MU)
Hydro Gen (MU)
Wind Gen (MU)
Solar Gen (MU)*Report for previous day Date of Reporting: 16-Jul-2020
A. Power Supply Position at All India and Regional level

def extract_tableA(pagedata):
    lines = pagedata.split("\n")
    headers = lines[0].strip().split()
    rows  = []
    for line in lines[1:10]:
        rows.append(line.strip().split())
        
    return headers, rows


def extract_tableA(pagedata):
    lines = pagedata.split("\n")
    headers = lines[0].strip().split()
    rows = [line.strip().split() for line in lines[1:10]]
    return headers, rows


def extract_tableA(pagedata):
    lines = pagedata.split("\n")
    data = [line.strip().split() for line in lines[:10]]
    return data[0], data[1:]

pagedata = extractPage("powerdata.pdf", 1)
headers, rows = extract_tableA(pagedata)

headers

['NR', 'WR', 'SR', 'ER', 'NER', 'TOTAL']

rows

[['59882', '41115', '34238', '21526', '2730', '159491'],
 ['1114', '0', '0', '0', '6', '1120'],
 ['1398', '998', '807', '447', '48', '3698'],
 ['355', '33', '77', '149', '29', '643'],
 ['11', '49', '128', '-', '-', '187'],
 ['39.60', '16.60', '41.59', '4.60', '0.03', '102'],
 ['12.6', '0.0', '0.0', '0.0', '0.0', '12.6'],
 ['65470', '43593', '38117', '21535', '2827', '160654'],
 ['22:20', '10:29', '10:00', '21:20', '19:41', '21:26']]

import pandas as pd
pd.DataFrame([{"c1":1, "c2":2, "c3":3},
             {"c1":21, "c2":22, "c3":23},
             {"c1":31, "c2":32, "c3":33},
             {"c1":41, "c2":42, "c3":43}])

	c1	c2	c3
0	1	2	3
1	21	22	23
2	31	32	33
3	41	42	43

dict(zip(headers, rows[0]))

{'NR': '59882',
 'WR': '41115',
 'SR': '34238',
 'ER': '21526',
 'NER': '2730',
 'TOTAL': '159491'}

{k:v for k,v in zip(headers, rows[0])}

{'NR': '59882',
 'WR': '41115',
 'SR': '34238',
 'ER': '21526',
 'NER': '2730',
 'TOTAL': '159491'}

[dict(zip(headers,row)) for row in rows]

[{'NR': '59882',
  'WR': '41115',
  'SR': '34238',
  'ER': '21526',
  'NER': '2730',
  'TOTAL': '159491'},
 {'NR': '1114', 'WR': '0', 'SR': '0', 'ER': '0', 'NER': '6', 'TOTAL': '1120'},
 {'NR': '1398',
  'WR': '998',
  'SR': '807',
  'ER': '447',
  'NER': '48',
  'TOTAL': '3698'},
 {'NR': '355',
  'WR': '33',
  'SR': '77',
  'ER': '149',
  'NER': '29',
  'TOTAL': '643'},
 {'NR': '11', 'WR': '49', 'SR': '128', 'ER': '-', 'NER': '-', 'TOTAL': '187'},
 {'NR': '39.60',
  'WR': '16.60',
  'SR': '41.59',
  'ER': '4.60',
  'NER': '0.03',
  'TOTAL': '102'},
 {'NR': '12.6',
  'WR': '0.0',
  'SR': '0.0',
  'ER': '0.0',
  'NER': '0.0',
  'TOTAL': '12.6'},
 {'NR': '65470',
  'WR': '43593',
  'SR': '38117',
  'ER': '21535',
  'NER': '2827',
  'TOTAL': '160654'},
 {'NR': '22:20',
  'WR': '10:29',
  'SR': '10:00',
  'ER': '21:20',
  'NER': '19:41',
  'TOTAL': '21:26'}]

pd.DataFrame([dict(zip(headers,row)) for row in rows])

	NR	WR	SR	ER	NER	TOTAL
0	59882	41115	34238	21526	2730	159491
1	1114	0	0	0	6	1120
2	1398	998	807	447	48	3698
3	355	33	77	149	29	643
4	11	49	128	-	-	187
5	39.60	16.60	41.59	4.60	0.03	102
6	12.6	0.0	0.0	0.0	0.0	12.6
7	65470	43593	38117	21535	2827	160654
8	22:20	10:29	10:00	21:20	19:41	21:26

Why should we write to a specialized program to read this particular table?

so that we should not repeat the effort of decoding the data manually
condition is data should come in similar format (there is some logic that you can figure out)
Do not write python automation for problems which do not follow logic of how data is stored!

pd.DataFrame({"a":[1, 2, 3, 4],
             "b":[2, 3, 4, 5],
             "c":[23, 23, 34, 54]},
            index=["X","Y","Z","W"])

	a	b	c
X	1	2	23
Y	2	3	23
Z	3	4	34
W	4	5	54

headers, rows

(['NR', 'WR', 'SR', 'ER', 'NER', 'TOTAL'],
 [['59882', '41115', '34238', '21526', '2730', '159491'],
  ['1114', '0', '0', '0', '6', '1120'],
  ['1398', '998', '807', '447', '48', '3698'],
  ['355', '33', '77', '149', '29', '643'],
  ['11', '49', '128', '-', '-', '187'],
  ['39.60', '16.60', '41.59', '4.60', '0.03', '102'],
  ['12.6', '0.0', '0.0', '0.0', '0.0', '12.6'],
  ['65470', '43593', '38117', '21535', '2827', '160654'],
  ['22:20', '10:29', '10:00', '21:20', '19:41', '21:26']])

pd.DataFrame({"NR":[],"WR":[]})

	NR	WR

rowlabels = ["Demand Met during Evening Peak hrs(MW) (at 2000 hrs; from RLDCs)",
             "Peak Shortage (MW)",
            "Energy Met (MU)",
            "Hydro Gen (MU)"
            "Wind Gen (MU)",
            "Solar Gen (MU)*",
             "Energy Shortage (MU)",
            "Maximum Demand Met During the Day (MW) (From NLDC SCADA)",
            "Time Of Maximum Demand Met (From NLDC SCADA)"]

rows

[['59882', '41115', '34238', '21526', '2730', '159491'],
 ['1114', '0', '0', '0', '6', '1120'],
 ['1398', '998', '807', '447', '48', '3698'],
 ['355', '33', '77', '149', '29', '643'],
 ['11', '49', '128', '-', '-', '187'],
 ['39.60', '16.60', '41.59', '4.60', '0.03', '102'],
 ['12.6', '0.0', '0.0', '0.0', '0.0', '12.6'],
 ['65470', '43593', '38117', '21535', '2827', '160654'],
 ['22:20', '10:29', '10:00', '21:20', '19:41', '21:26']]

[row[0] for row in rows]

['59882', '1114', '1398', '355', '11', '39.60', '12.6', '65470', '22:20']

def column(data, i):
    return [row[i] for row in data]

column(rows, 0)

['59882', '1114', '1398', '355', '11', '39.60', '12.6', '65470', '22:20']

dict(zip(headers, [column(rows, i) for i in range(len(headers))]))

{'NR': ['59882',
  '1114',
  '1398',
  '355',
  '11',
  '39.60',
  '12.6',
  '65470',
  '22:20'],
 'WR': ['41115', '0', '998', '33', '49', '16.60', '0.0', '43593', '10:29'],
 'SR': ['34238', '0', '807', '77', '128', '41.59', '0.0', '38117', '10:00'],
 'ER': ['21526', '0', '447', '149', '-', '4.60', '0.0', '21535', '21:20'],
 'NER': ['2730', '6', '48', '29', '-', '0.03', '0.0', '2827', '19:41'],
 'TOTAL': ['159491',
  '1120',
  '3698',
  '643',
  '187',
  '102',
  '12.6',
  '160654',
  '21:26']}

pd.DataFrame(dict(zip(headers, [column(rows, i) for i in range(len(headers))])),
             index=rowlabels)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[44], line 1
----> 1 pd.DataFrame(dict(zip(headers, [column(rows, i) for i in range(len(headers))])),
      2              index=rowlabels)

File /opt/tljh/user/lib/python3.10/site-packages/pandas/core/frame.py:778, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    772     mgr = self._init_mgr(
    773         data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
    774     )
    776 elif isinstance(data, dict):
    777     # GH#38939 de facto copy defaults to False only in non-dict cases
--> 778     mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
    779 elif isinstance(data, ma.MaskedArray):
    780     from numpy.ma import mrecords

File /opt/tljh/user/lib/python3.10/site-packages/pandas/core/internals/construction.py:503, in dict_to_mgr(data, index, columns, dtype, typ, copy)
    499     else:
    500         # dtype check to exclude e.g. range objects, scalars
    501         arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]
--> 503 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)

File /opt/tljh/user/lib/python3.10/site-packages/pandas/core/internals/construction.py:119, in arrays_to_mgr(arrays, columns, index, dtype, verify_integrity, typ, consolidate)
    116         index = ensure_index(index)
    118     # don't force copy because getting jammed in an ndarray anyway
--> 119     arrays, refs = _homogenize(arrays, index, dtype)
    120     # _homogenize ensures
    121     #  - all(len(x) == len(index) for x in arrays)
    122     #  - all(x.ndim == 1 for x in arrays)
   (...)
    125 
    126 else:
    127     index = ensure_index(index)

File /opt/tljh/user/lib/python3.10/site-packages/pandas/core/internals/construction.py:630, in _homogenize(data, index, dtype)
    627         val = lib.fast_multiget(val, oindex._values, default=np.nan)
    629     val = sanitize_array(val, index, dtype=dtype, copy=False)
--> 630     com.require_length_match(val, index)
    631     refs.append(None)
    633 homogenized.append(val)

File /opt/tljh/user/lib/python3.10/site-packages/pandas/core/common.py:573, in require_length_match(data, index)
    569 """
    570 Check the length of data matches the length of the index.
    571 """
    572 if len(data) != len(index):
--> 573     raise ValueError(
    574         "Length of values "
    575         f"({len(data)}) "
    576         "does not match length of index "
    577         f"({len(index)})"
    578     )

ValueError: Length of values (9) does not match length of index (8)

rowlabels

['Demand Met during Evening Peak hrs(MW) (at 2000 hrs; from RLDCs)',
 'Peak Shortage (MW)',
 'Energy Met (MU)',
 'Hydro Gen (MU)Wind Gen (MU)',
 'Solar Gen (MU)*',
 'Energy Shortage (MU)',
 'Maximum Demand Met During the Day (MW) (From NLDC SCADA)',
 'Time Of Maximum Demand Met (From NLDC SCADA)']

rowlabels = ["Demand Met during Evening Peak hrs(MW) (at 2000 hrs; from RLDCs)",
             "Peak Shortage (MW)",
            "Energy Met (MU)",
            "Hydro Gen (MU)",
            "Wind Gen (MU)",
            "Solar Gen (MU)*",
             "Energy Shortage (MU)",
            "Maximum Demand Met During the Day (MW) (From NLDC SCADA)",
            "Time Of Maximum Demand Met (From NLDC SCADA)"]

pd.DataFrame(dict(zip(headers, [column(rows, i) for i in range(len(headers))])),
             index=rowlabels)

	NR	WR	SR	ER	NER	TOTAL
Demand Met during Evening Peak hrs(MW) (at 2000 hrs; from RLDCs)	59882	41115	34238	21526	2730	159491
Peak Shortage (MW)	1114	0	0	0	6	1120
Energy Met (MU)	1398	998	807	447	48	3698
Hydro Gen (MU)	355	33	77	149	29	643
Wind Gen (MU)	11	49	128	-	-	187
Solar Gen (MU)*	39.60	16.60	41.59	4.60	0.03	102
Energy Shortage (MU)	12.6	0.0	0.0	0.0	0.0	12.6
Maximum Demand Met During the Day (MW) (From NLDC SCADA)	65470	43593	38117	21535	2827	160654
Time Of Maximum Demand Met (From NLDC SCADA)	22:20	10:29	10:00	21:20	19:41	21:26

Reading and Writing to databases

import sqlite3

conn = sqlite3.connect("traning.db") # if the file database does not exist , it will create it

cur = conn.cursor()

cur.execute("create table person (name varchar(100), email varchar(100));")

<sqlite3.Cursor at 0x7f8d4404af40>

cur.execute("insert into person (name, email) values ('alice', 'alice@wonder.land');")

<sqlite3.Cursor at 0x7f8d4404af40>

conn.commit() # it is not gauranteed that data is written unless we call commit statement

conn.close()

Access

conn = sqlite3.connect("traning.db")

cur = conn.cursor()
r = cur.execute("select * from person")

<sqlite3.Cursor at 0x7f8d4404b840>

r.fetchall() # this is iterator!

[('alice', 'alice@wonder.land')]

r.fetchall() # it is already consumed

[]

r1 = cur.execute("select * from person") # this returns handle to an iterator
r1.fetchall()

[('alice', 'alice@wonder.land')]

def find_person(conn, name):
    c = conn.cursor()
    r = c.execute(f"select * from person where name='{name}'")
    return r.fetchall()

find_person(conn, "alice")

[('alice', 'alice@wonder.land')]

conn.cursor().execute("insert into person (name, email) values ('alex', 'alex@nyzoo.com');")

<sqlite3.Cursor at 0x7f8d3ab136c0>

conn.commit()

find_person(conn, "alex")

[('alex', 'alex@nyzoo.com')]

conn.cursor().execute("insert into person (name, email) values ('alex', 'alex@nyzoo.com');")
conn.commit()

find_person(conn, "alex")

[('alex', 'alex@nyzoo.com'), ('alex', 'alex@nyzoo.com')]

(1, 2)

(1, 2)

(1) # this is not a tuple

(1,) # this is a tuple with single value

(1,)

Safe way of executing a query

def find_person_(conn, name):
    cursor = conn.cursor()
    r = cursor.execute("select * from person where name=?", (name,))
    return r.fetchall()

find_person_(conn, name='alex')

[('alex', 'alex@nyzoo.com'), ('alex', 'alex@nyzoo.com')]

find_person_(conn, name='alic')

[]

find_person_(conn, name='alice')

[('alice', 'alice@wonder.land')]

def safe_query(conn, querystring, params=None):
    cur = conn.cursor()
    r = cur.execute(querystring, params)
    return r.fetchall()

Pandas and Database

import pandas as pd

wallet = pd.read_excel("wallet.xlsx")

wallet

	Unnamed: 0	date	category	description	debit
0	0	2021-03-07 14:53:28.377359	Music	Amazon	421.207327
1	1	2020-10-08 09:53:28.377359	Food	Swiggy	328.440080
2	2	2021-02-23 09:53:28.377359	Books	Amazon	244.679437
3	3	2020-11-01 14:53:28.377359	Utility	Phone	222.756318
4	4	2021-06-05 13:53:28.377359	Books	Flipcart	494.128492
...	...	...	...	...	...
95	95	2021-07-19 13:53:28.377359	Utility	Phone	388.671213
96	96	2021-01-12 19:53:28.377359	Books	Flipcart	467.554562
97	97	2021-03-25 11:53:28.377359	Utility	Phone	320.789434
98	98	2021-05-13 15:53:28.377359	Travel	Taxi	442.096469
99	99	2020-10-11 16:53:28.377359	Food	Hotel	100.455501

100 rows × 5 columns

wallet

	Unnamed: 0	date	category	description	debit
0	0	2021-03-07 14:53:28.377359	Music	Amazon	421.207327
1	1	2020-10-08 09:53:28.377359	Food	Swiggy	328.440080
2	2	2021-02-23 09:53:28.377359	Books	Amazon	244.679437
3	3	2020-11-01 14:53:28.377359	Utility	Phone	222.756318
4	4	2021-06-05 13:53:28.377359	Books	Flipcart	494.128492
...	...	...	...	...	...
95	95	2021-07-19 13:53:28.377359	Utility	Phone	388.671213
96	96	2021-01-12 19:53:28.377359	Books	Flipcart	467.554562
97	97	2021-03-25 11:53:28.377359	Utility	Phone	320.789434
98	98	2021-05-13 15:53:28.377359	Travel	Taxi	442.096469
99	99	2020-10-11 16:53:28.377359	Food	Hotel	100.455501

100 rows × 5 columns

del wallet['Unnamed: 0']

wallet

	date	category	description	debit
0	2021-03-07 14:53:28.377359	Music	Amazon	421.207327
1	2020-10-08 09:53:28.377359	Food	Swiggy	328.440080
2	2021-02-23 09:53:28.377359	Books	Amazon	244.679437
3	2020-11-01 14:53:28.377359	Utility	Phone	222.756318
4	2021-06-05 13:53:28.377359	Books	Flipcart	494.128492
...	...	...	...	...
95	2021-07-19 13:53:28.377359	Utility	Phone	388.671213
96	2021-01-12 19:53:28.377359	Books	Flipcart	467.554562
97	2021-03-25 11:53:28.377359	Utility	Phone	320.789434
98	2021-05-13 15:53:28.377359	Travel	Taxi	442.096469
99	2020-10-11 16:53:28.377359	Food	Hotel	100.455501

100 rows × 4 columns

wallet.to_sql("expnses", con=conn, if_exists="replace") # be careful it will replace the table data!

safe_query(conn, "select * from expnses where category=?", ("Music",))

[(0, '2021-03-07 14:53:28.377359', 'Music', 'Amazon', 421.207327234799),
 (12, '2021-07-24 14:53:28.377359', 'Music', 'Amazon', 329.536003189757),
 (23, '2020-12-11 10:53:28.377359', 'Music', 'Netflix', 354.940240991982),
 (37, '2021-04-18 16:53:28.377359', 'Music', 'Amazon', 266.069078377467),
 (49, '2021-03-16 09:53:28.377359', 'Music', 'spotify', 232.303402191211),
 (54, '2020-11-16 10:53:28.377359', 'Music', 'spotify', 160.817543407684),
 (65, '2020-10-23 18:53:28.377359', 'Music', 'Netflix', 188.748742689512),
 (67, '2021-07-31 14:53:28.377359', 'Music', 'Netflix', 324.786916846731),
 (76, '2020-11-17 09:53:28.377359', 'Music', 'Netflix', 197.53460001679),
 (78, '2020-09-09 16:53:28.377359', 'Music', 'spotify', 415.37289380353),
 (79, '2021-08-17 09:53:28.377359', 'Music', 'Netflix', 321.763415654465),
 (82, '2020-09-22 09:53:28.377359', 'Music', 'spotify', 411.142701208422),
 (84, '2020-09-21 10:53:28.377359', 'Music', 'Netflix', 158.793645726933),
 (85, '2021-01-12 09:53:28.377359', 'Music', 'Amazon', 130.37490757527),
 (89, '2021-04-12 14:53:28.377359', 'Music', 'Amazon', 218.487173429263),
 (90, '2020-12-01 14:53:28.377359', 'Music', 'Amazon', 101.573275888894)]

safe_query(conn, "select * from person where name=?", ("alice",))

[('alice', 'alice@wonder.land')]

safe_query(conn, "select * from expnses where description=?", ("Amazon",))

[(0, '2021-03-07 14:53:28.377359', 'Music', 'Amazon', 421.207327234799),
 (2, '2021-02-23 09:53:28.377359', 'Books', 'Amazon', 244.679437015114),
 (12, '2021-07-24 14:53:28.377359', 'Music', 'Amazon', 329.536003189757),
 (24, '2021-05-31 11:53:28.377359', 'Books', 'Amazon', 498.100495504611),
 (37, '2021-04-18 16:53:28.377359', 'Music', 'Amazon', 266.069078377467),
 (72, '2021-06-30 18:53:28.377359', 'Books', 'Amazon', 294.662868990049),
 (85, '2021-01-12 09:53:28.377359', 'Music', 'Amazon', 130.37490757527),
 (89, '2021-04-12 14:53:28.377359', 'Music', 'Amazon', 218.487173429263),
 (90, '2020-12-01 14:53:28.377359', 'Music', 'Amazon', 101.573275888894)]

wallet.to_sql("expnses", con=conn, if_exists="append")

safe_query(conn, "select * from expnses where description=?", ("Amazon",))

[(0, '2021-03-07 14:53:28.377359', 'Music', 'Amazon', 421.207327234799),
 (2, '2021-02-23 09:53:28.377359', 'Books', 'Amazon', 244.679437015114),
 (12, '2021-07-24 14:53:28.377359', 'Music', 'Amazon', 329.536003189757),
 (24, '2021-05-31 11:53:28.377359', 'Books', 'Amazon', 498.100495504611),
 (37, '2021-04-18 16:53:28.377359', 'Music', 'Amazon', 266.069078377467),
 (72, '2021-06-30 18:53:28.377359', 'Books', 'Amazon', 294.662868990049),
 (85, '2021-01-12 09:53:28.377359', 'Music', 'Amazon', 130.37490757527),
 (89, '2021-04-12 14:53:28.377359', 'Music', 'Amazon', 218.487173429263),
 (90, '2020-12-01 14:53:28.377359', 'Music', 'Amazon', 101.573275888894),
 (0, '2021-03-07 14:53:28.377359', 'Music', 'Amazon', 421.207327234799),
 (2, '2021-02-23 09:53:28.377359', 'Books', 'Amazon', 244.679437015114),
 (12, '2021-07-24 14:53:28.377359', 'Music', 'Amazon', 329.536003189757),
 (24, '2021-05-31 11:53:28.377359', 'Books', 'Amazon', 498.100495504611),
 (37, '2021-04-18 16:53:28.377359', 'Music', 'Amazon', 266.069078377467),
 (72, '2021-06-30 18:53:28.377359', 'Books', 'Amazon', 294.662868990049),
 (85, '2021-01-12 09:53:28.377359', 'Music', 'Amazon', 130.37490757527),
 (89, '2021-04-12 14:53:28.377359', 'Music', 'Amazon', 218.487173429263),
 (90, '2020-12-01 14:53:28.377359', 'Music', 'Amazon', 101.573275888894)]

pd.read_sql("select * from expnses", conn)

	index	date	category	description	debit
0	0	2021-03-07 14:53:28.377359	Music	Amazon	421.207327
1	1	2020-10-08 09:53:28.377359	Food	Swiggy	328.440080
2	2	2021-02-23 09:53:28.377359	Books	Amazon	244.679437
3	3	2020-11-01 14:53:28.377359	Utility	Phone	222.756318
4	4	2021-06-05 13:53:28.377359	Books	Flipcart	494.128492
...	...	...	...	...	...
195	95	2021-07-19 13:53:28.377359	Utility	Phone	388.671213
196	96	2021-01-12 19:53:28.377359	Books	Flipcart	467.554562
197	97	2021-03-25 11:53:28.377359	Utility	Phone	320.789434
198	98	2021-05-13 15:53:28.377359	Travel	Taxi	442.096469
199	99	2020-10-11 16:53:28.377359	Food	Hotel	100.455501

200 rows × 5 columns

Only for Windows Code snippets

Microsoft sql server

you will need a package called pyodbc

install the package in virtualenv (preferable)

pip install pyodbc

import pyodbc
cnxn = pyodbc.connect(r'Driver={SQL Server};Server=.\SQLEXPRESS;Database=myDB;Trusted_Connection=yes;')
cursor = cnxn.cursor()
cursor.execute("SELECT LastName FROM myContacts")
while 1:
    row = cursor.fetchone()
    if not row:
        break
    print(row.LastName)
cnxn.close()

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[105], line 1
----> 1 import pyodbc
      2 cnxn = pyodbc.connect(r'Driver={SQL Server};Server=.\SQLEXPRESS;Database=myDB;Trusted_Connection=yes;')
      3 cursor = cnxn.cursor()

ModuleNotFoundError: No module named 'pyodbc'