Mar 13-17, 2023 Vikrant Patil
All notes are available online at https://notes.pipal.in/2023/arcesium_finop_jan/
© Pipal Academy LLP
Write a function combined_daily_time_series that combines stock data from www.alphavantage.co for a given list of symbols and returns a dataframe. The dataframe should have the numeric columns open, high, low, close and volume, plus a symbol column. Here are some hints: have a look at the functions and methods pd.concat, DataFrame.rename and pd.to_numeric.
Also write a function get_total_volume which takes the dataframe generated by the above function and returns a series that has the symbols as row labels and the total volume for each symbol as values.
>>> df = combined_daily_time_series(["AAPL","IBM"])
>>> df
open high low close volume symbol
2022-08-25 20:00:00 170.290 170.3500 170.2000 170.2000 11160 AAPL
2022-08-25 20:15:00 170.290 170.3500 170.2000 170.2000 11160 AAPL
2022-08-25 20:30:00 170.290 170.3500 170.2000 170.2000 11160 AAPL
.
.
2022-08-25 20:00:00 170.290 170.3500 170.2000 170.2000 11160 IBM
2022-08-25 20:15:00 170.290 170.3500 170.2000 170.2000 11160 IBM
200 rows × 6 columns
>>> get_total_volume(df)
symbol
AAPL 77772389
IBM 7689538
Name: volume, dtype: int64
import requests
# Endpoint that the request below is sent to.
url = 'https://www.alphavantage.co/query'
# NOTE(review): the API key is hard-coded in the notes; prefer loading it from configuration.
API_KEY = "UKVFE0JLE0TBPDEF"
# Query parameters: intraday time series for IBM at a 15-minute interval.
params = {"function":"TIME_SERIES_INTRADAY",
"apikey":API_KEY,
"symbol":"IBM",
"interval":"15min",
}
# The second positional argument of requests.get is sent as the query string.
r = requests.get(url, params)
# Bare expression so the notebook displays the response object.
r
<Response [200]>
import pandas as pd
pd.DataFrame(r.json()['Time Series (15min)']).transpose()
| 1. open | 2. high | 3. low | 4. close | 5. volume | |
|---|---|---|---|---|---|
| 2023-03-29 18:30:00 | 129.5800 | 129.5800 | 129.5800 | 129.5800 | 100 |
| 2023-03-29 18:15:00 | 129.7200 | 129.7200 | 129.5800 | 129.5800 | 200 |
| 2023-03-29 17:45:00 | 129.7200 | 129.7200 | 129.7200 | 129.7200 | 100 |
| 2023-03-29 17:30:00 | 129.6500 | 129.6500 | 129.6500 | 129.6500 | 200 |
| 2023-03-29 17:15:00 | 129.6000 | 129.6000 | 129.6000 | 129.6000 | 301 |
| ... | ... | ... | ... | ... | ... |
| 2023-03-27 13:15:00 | 129.3200 | 129.4350 | 129.1600 | 129.2400 | 103222 |
| 2023-03-27 13:00:00 | 129.2900 | 129.3250 | 129.1938 | 129.3100 | 85797 |
| 2023-03-27 12:45:00 | 129.2600 | 129.3100 | 129.1700 | 129.2500 | 79987 |
| 2023-03-27 12:30:00 | 129.0100 | 129.2800 | 128.9690 | 129.2500 | 123038 |
| 2023-03-27 12:15:00 | 128.9600 | 129.0900 | 128.9100 | 129.0141 | 117268 |
100 rows × 5 columns
def get_data(ticker):
    """Fetch the 15-minute intraday series for ``ticker`` as a DataFrame.

    The frame has one row per timestamp, the columns exactly as named in the
    response ('1. open' ... '5. volume', values still strings) and an added
    'symbol' column holding ``ticker``.

    Args:
        ticker: Stock symbol to request, e.g. "IBM".

    Returns:
        pandas.DataFrame indexed by the timestamp strings of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no 'Time Series (15min)' entry; the
            message lists the keys that were returned instead.
    """
    url = 'https://www.alphavantage.co/query'
    API_KEY = "UKVFE0JLE0TBPDEF"
    params = {
        "function": "TIME_SERIES_INTRADAY",
        "apikey": API_KEY,
        "symbol": ticker,
        "interval": "15min",
    }
    # A timeout keeps a stalled connection from hanging the caller forever.
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    payload = r.json()
    series_key = 'Time Series (15min)'
    if series_key not in payload:
        # The body parsed as JSON but carries no series (e.g. the service
        # replied with a message instead of data); say what came back.
        raise KeyError(
            f"{series_key!r} missing from response for {ticker!r}; "
            f"got keys {sorted(payload)}"
        )
    # The JSON maps timestamp -> fields, so transpose to get timestamps as rows.
    df = pd.DataFrame(payload[series_key]).transpose()
    df['symbol'] = ticker  # tag every row with the ticker it belongs to
    return df
def combined_daily_time_series(tickers):
    """Stack the per-ticker frames from ``get_data`` into one DataFrame.

    Frames are concatenated row-wise, in the order of ``tickers``.
    """
    frames = []
    for symbol in tickers:
        frames.append(get_data(symbol))
    return pd.concat(frames)
combined_daily_time_series(["IBM"])
| 1. open | 2. high | 3. low | 4. close | 5. volume | symbol | |
|---|---|---|---|---|---|---|
| 2023-03-29 18:30:00 | 129.5800 | 129.5800 | 129.5800 | 129.5800 | 100 | IBM |
| 2023-03-29 18:15:00 | 129.7200 | 129.7200 | 129.5800 | 129.5800 | 200 | IBM |
| 2023-03-29 17:45:00 | 129.7200 | 129.7200 | 129.7200 | 129.7200 | 100 | IBM |
| 2023-03-29 17:30:00 | 129.6500 | 129.6500 | 129.6500 | 129.6500 | 200 | IBM |
| 2023-03-29 17:15:00 | 129.6000 | 129.6000 | 129.6000 | 129.6000 | 301 | IBM |
| ... | ... | ... | ... | ... | ... | ... |
| 2023-03-27 13:15:00 | 129.3200 | 129.4350 | 129.1600 | 129.2400 | 103222 | IBM |
| 2023-03-27 13:00:00 | 129.2900 | 129.3250 | 129.1938 | 129.3100 | 85797 | IBM |
| 2023-03-27 12:45:00 | 129.2600 | 129.3100 | 129.1700 | 129.2500 | 79987 | IBM |
| 2023-03-27 12:30:00 | 129.0100 | 129.2800 | 128.9690 | 129.2500 | 123038 | IBM |
| 2023-03-27 12:15:00 | 128.9600 | 129.0900 | 128.9100 | 129.0141 | 117268 | IBM |
100 rows × 6 columns
combined_daily_time_series(["IBM","AAPL"])
| 1. open | 2. high | 3. low | 4. close | 5. volume | symbol | |
|---|---|---|---|---|---|---|
| 2023-03-29 18:30:00 | 129.5800 | 129.5800 | 129.5800 | 129.5800 | 100 | IBM |
| 2023-03-29 18:15:00 | 129.7200 | 129.7200 | 129.5800 | 129.5800 | 200 | IBM |
| 2023-03-29 17:45:00 | 129.7200 | 129.7200 | 129.7200 | 129.7200 | 100 | IBM |
| 2023-03-29 17:30:00 | 129.6500 | 129.6500 | 129.6500 | 129.6500 | 200 | IBM |
| 2023-03-29 17:15:00 | 129.6000 | 129.6000 | 129.6000 | 129.6000 | 301 | IBM |
| ... | ... | ... | ... | ... | ... | ... |
| 2023-03-28 12:15:00 | 156.9200 | 156.9500 | 156.5400 | 156.6800 | 892032 | AAPL |
| 2023-03-28 12:00:00 | 156.9300 | 157.0300 | 156.7502 | 156.9100 | 1203419 | AAPL |
| 2023-03-28 11:45:00 | 156.6000 | 157.0000 | 156.5750 | 156.9200 | 1466377 | AAPL |
| 2023-03-28 11:30:00 | 156.7750 | 156.8900 | 156.3900 | 156.6000 | 1258754 | AAPL |
| 2023-03-28 11:15:00 | 156.4400 | 156.9000 | 156.4050 | 156.7776 | 1387565 | AAPL |
200 rows × 6 columns
def get_data(ticker):
    """Fetch the 15-minute intraday series for ``ticker`` as a numeric DataFrame.

    Args:
        ticker: Stock symbol to request, e.g. "IBM".

    Returns:
        pandas.DataFrame indexed by the timestamp strings of the response,
        with numeric columns open, high, low, close, volume and a 'symbol'
        column holding ``ticker``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no 'Time Series (15min)' entry; the
            message lists the keys that were returned instead.
    """
    url = 'https://www.alphavantage.co/query'
    API_KEY = "UKVFE0JLE0TBPDEF"
    params = {
        "function": "TIME_SERIES_INTRADAY",
        "apikey": API_KEY,
        "symbol": ticker,
        "interval": "15min",
    }
    # A timeout keeps a stalled connection from hanging the caller forever.
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    payload = r.json()
    series_key = 'Time Series (15min)'
    if series_key not in payload:
        # The body parsed as JSON but carries no series (e.g. the service
        # replied with a message instead of data); say what came back.
        raise KeyError(
            f"{series_key!r} missing from response for {ticker!r}; "
            f"got keys {sorted(payload)}"
        )
    # The JSON maps timestamp -> fields, so transpose to get timestamps as rows.
    df = pd.DataFrame(payload[series_key]).transpose()
    df['symbol'] = ticker  # tag every row with the ticker it belongs to
    column_names = {
        '1. open': "open",
        '2. high': "high",
        '3. low': "low",
        '4. close': "close",
        '5. volume': "volume",
    }
    df = df.rename(columns=column_names)
    # Values arrive as strings; convert only the price/volume columns.
    for c in column_names.values():
        df[c] = pd.to_numeric(df[c])
    return df
def combined_daily_time_series(tickers):
    """Download every ticker with ``get_data`` and concatenate the results.

    The rows of each ticker follow one another in the order of ``tickers``.
    """
    per_ticker = list(map(get_data, tickers))
    combined = pd.concat(per_ticker)
    return combined
df = combined_daily_time_series(["IBM","AAPL"])
df
| open | high | low | close | volume | symbol | |
|---|---|---|---|---|---|---|
| 2023-03-29 18:30:00 | 129.580 | 129.58 | 129.5800 | 129.5800 | 100 | IBM |
| 2023-03-29 18:15:00 | 129.720 | 129.72 | 129.5800 | 129.5800 | 200 | IBM |
| 2023-03-29 17:45:00 | 129.720 | 129.72 | 129.7200 | 129.7200 | 100 | IBM |
| 2023-03-29 17:30:00 | 129.650 | 129.65 | 129.6500 | 129.6500 | 200 | IBM |
| 2023-03-29 17:15:00 | 129.600 | 129.60 | 129.6000 | 129.6000 | 301 | IBM |
| ... | ... | ... | ... | ... | ... | ... |
| 2023-03-28 12:15:00 | 156.920 | 156.95 | 156.5400 | 156.6800 | 892032 | AAPL |
| 2023-03-28 12:00:00 | 156.930 | 157.03 | 156.7502 | 156.9100 | 1203419 | AAPL |
| 2023-03-28 11:45:00 | 156.600 | 157.00 | 156.5750 | 156.9200 | 1466377 | AAPL |
| 2023-03-28 11:30:00 | 156.775 | 156.89 | 156.3900 | 156.6000 | 1258754 | AAPL |
| 2023-03-28 11:15:00 | 156.440 | 156.90 | 156.4050 | 156.7776 | 1387565 | AAPL |
200 rows × 6 columns
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 200 entries, 2023-03-29 18:30:00 to 2023-03-28 11:15:00 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 open 200 non-null float64 1 high 200 non-null float64 2 low 200 non-null float64 3 close 200 non-null float64 4 volume 200 non-null int64 5 symbol 200 non-null object dtypes: float64(4), int64(1), object(1) memory usage: 10.9+ KB
import pandas as pd
import requests
def combined_daily_time_series(symbols):
    """Download daily data for each symbol and combine it into one DataFrame.

    Args:
        symbols: Iterable of stock symbols, e.g. ["AAPL", "IBM"].

    Returns:
        pandas.DataFrame with numeric columns open, high, low, close, volume
        and a string column symbol. Rows of each symbol follow one another in
        the order given. An empty frame with those columns is returned when
        ``symbols`` is empty.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a response has no 'Time Series (Daily)' entry; the
            message lists the keys that were returned instead.
    """
    renames = {
        "1. open": "open",
        "2. high": "high",
        "3. low": "low",
        "4. close": "close",
        "6. volume": "volume",
    }
    numeric_columns = list(renames.values())
    wanted = numeric_columns + ["symbol"]
    series_key = "Time Series (Daily)"

    dfs = []
    for symbol in symbols:
        response = requests.get(
            "https://www.alphavantage.co/query",
            params={
                "function": "TIME_SERIES_DAILY_ADJUSTED",
                "symbol": symbol,
                "outputsize": "full",
                # NOTE(review): key is sent without the '<' '>' placeholder
                # brackets it was wrapped in before; confirm it is the intended key.
                "apikey": "FG5B837V8HPIRTOO",
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        response.raise_for_status()
        payload = response.json()
        if series_key not in payload:
            raise KeyError(
                f"{series_key!r} missing from response for {symbol!r}; "
                f"got keys {sorted(payload)}"
            )
        df = pd.DataFrame.from_dict(payload[series_key], orient="index")
        df = df.rename(columns=renames)
        # Convert only the price/volume columns: running pd.to_numeric over
        # the whole frame would also hit 'symbol' and fail on the ticker text.
        for name in numeric_columns:
            df[name] = pd.to_numeric(df[name])
        df["symbol"] = symbol
        # Keep just the six documented columns; the response carries more.
        dfs.append(df[wanted])

    if not dfs:
        # pd.concat refuses an empty list; return a well-formed empty frame.
        return pd.DataFrame(columns=wanted)
    return pd.concat(dfs)
def get_total_volume(df):
    """Return the total traded volume per symbol.

    Args:
        df: DataFrame with a 'symbol' and a 'volume' column. Volumes may be
            numbers or numeric strings.

    Returns:
        pandas.Series named 'volume', indexed by symbol, holding the sum of
        the volumes of that symbol. ``df`` itself is not modified.
    """
    # Volumes that are still strings would be concatenated by sum(), so
    # coerce them on a copy before grouping.
    numeric = df.assign(volume=pd.to_numeric(df["volume"]))
    return numeric.groupby("symbol")["volume"].sum()
| open | high | low | close | volume | symbol | |
|---|---|---|---|---|---|---|
| 2023-03-29 18:30:00 | 129.580 | 129.58 | 129.5800 | 129.5800 | 100 | IBM |
| 2023-03-29 18:15:00 | 129.720 | 129.72 | 129.5800 | 129.5800 | 200 | IBM |
| 2023-03-29 17:45:00 | 129.720 | 129.72 | 129.7200 | 129.7200 | 100 | IBM |
| 2023-03-29 17:30:00 | 129.650 | 129.65 | 129.6500 | 129.6500 | 200 | IBM |
| 2023-03-29 17:15:00 | 129.600 | 129.60 | 129.6000 | 129.6000 | 301 | IBM |
| ... | ... | ... | ... | ... | ... | ... |
| 2023-03-28 12:15:00 | 156.920 | 156.95 | 156.5400 | 156.6800 | 892032 | AAPL |
| 2023-03-28 12:00:00 | 156.930 | 157.03 | 156.7502 | 156.9100 | 1203419 | AAPL |
| 2023-03-28 11:45:00 | 156.600 | 157.00 | 156.5750 | 156.9200 | 1466377 | AAPL |
| 2023-03-28 11:30:00 | 156.775 | 156.89 | 156.3900 | 156.6000 | 1258754 | AAPL |
| 2023-03-28 11:15:00 | 156.440 | 156.90 | 156.4050 | 156.7776 | 1387565 | AAPL |
200 rows × 6 columns
df
| open | high | low | close | volume | symbol | |
|---|---|---|---|---|---|---|
| 2023-03-29 18:30:00 | 129.580 | 129.58 | 129.5800 | 129.5800 | 100 | IBM |
| 2023-03-29 18:15:00 | 129.720 | 129.72 | 129.5800 | 129.5800 | 200 | IBM |
| 2023-03-29 17:45:00 | 129.720 | 129.72 | 129.7200 | 129.7200 | 100 | IBM |
| 2023-03-29 17:30:00 | 129.650 | 129.65 | 129.6500 | 129.6500 | 200 | IBM |
| 2023-03-29 17:15:00 | 129.600 | 129.60 | 129.6000 | 129.6000 | 301 | IBM |
| ... | ... | ... | ... | ... | ... | ... |
| 2023-03-28 12:15:00 | 156.920 | 156.95 | 156.5400 | 156.6800 | 892032 | AAPL |
| 2023-03-28 12:00:00 | 156.930 | 157.03 | 156.7502 | 156.9100 | 1203419 | AAPL |
| 2023-03-28 11:45:00 | 156.600 | 157.00 | 156.5750 | 156.9200 | 1466377 | AAPL |
| 2023-03-28 11:30:00 | 156.775 | 156.89 | 156.3900 | 156.6000 | 1258754 | AAPL |
| 2023-03-28 11:15:00 | 156.440 | 156.90 | 156.4050 | 156.7776 | 1387565 | AAPL |
200 rows × 6 columns
import pandas as pd
import requests
def combined_daily_time_series(symbols):
    """Combine the daily series of several symbols into a single DataFrame.

    Args:
        symbols: Iterable of stock symbols, e.g. ["AAPL", "IBM"].

    Returns:
        pandas.DataFrame with numeric columns open, high, low, close, volume
        and a string column symbol, one block of rows per symbol in the order
        given. An empty frame with those columns is returned when ``symbols``
        is empty.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a response has no 'Time Series (Daily)' entry; the
            message lists the keys that were returned instead.
    """
    renames = {
        "1. open": "open",
        "2. high": "high",
        "3. low": "low",
        "4. close": "close",
        "6. volume": "volume",
    }
    numeric_columns = list(renames.values())
    wanted = numeric_columns + ["symbol"]
    series_key = "Time Series (Daily)"

    dfs = []
    for symbol in symbols:
        response = requests.get(
            "https://www.alphavantage.co/query",
            params={
                "function": "TIME_SERIES_DAILY_ADJUSTED",
                "symbol": symbol,
                "outputsize": "full",
                # NOTE(review): key is sent without the '<' '>' placeholder
                # brackets it was wrapped in before; confirm it is the intended key.
                "apikey": "FG5B837V8HPIRTOO",
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        response.raise_for_status()
        payload = response.json()
        if series_key not in payload:
            raise KeyError(
                f"{series_key!r} missing from response for {symbol!r}; "
                f"got keys {sorted(payload)}"
            )
        df = pd.DataFrame.from_dict(payload[series_key], orient="index")
        df = df.rename(columns=renames)
        # Only the price/volume columns are numeric; pd.to_numeric over the
        # whole frame would also be applied to 'symbol' and raise.
        for name in numeric_columns:
            df[name] = pd.to_numeric(df[name])
        df["symbol"] = symbol
        # Keep just the six documented columns; the response carries more.
        dfs.append(df[wanted])

    if not dfs:
        # pd.concat refuses an empty list; return a well-formed empty frame.
        return pd.DataFrame(columns=wanted)
    return pd.concat(dfs)
def get_total_volume(df):
    """Sum the traded volume of every symbol.

    Args:
        df: DataFrame with a 'symbol' and a 'volume' column. Volumes may be
            numbers or numeric strings.

    Returns:
        pandas.Series named 'volume' with the symbols as row labels and the
        total volume of each symbol as values. ``df`` is left untouched.
    """
    # sum() on string volumes concatenates them; convert on a copy first.
    numeric = df.assign(volume=pd.to_numeric(df["volume"]))
    return numeric.groupby("symbol")["volume"].sum()
get_total_volume(df)
symbol AAPL 72659191 IBM 8614473 Name: volume, dtype: int64
def get_max_volume(df):
    """Return the largest single traded volume per symbol.

    Args:
        df: DataFrame with a 'symbol' and a 'volume' column. Volumes may be
            numbers or numeric strings.

    Returns:
        pandas.Series named 'volume', indexed by symbol, holding the maximum
        volume seen for that symbol. ``df`` itself is not modified.
    """
    # Strings compare lexicographically ("9" > "100"), so coerce to numbers
    # on a copy before taking the maximum.
    numeric = df.assign(volume=pd.to_numeric(df["volume"]))
    return numeric.groupby("symbol")["volume"].max()
get_max_volume(df)
symbol AAPL 4985474 IBM 952777 Name: volume, dtype: int64
Write a function latest_jobs which will search for jobs at a given location in the Arcesium job portal. Arcesium jobs are listed at "https://careers.arcesium.com/go/All-Jobs/4687610/". Scrape this page and get a dataframe which contains the jobs at the given location from the latest 25 jobs. The dataframe should have Title and Location columns.
>>> latest_jobs("London")
Title Location
0 Solutions Architect-Private Markets Solutions London, ENG, GB
1 Solutions Architect Solutions Architect Lond London, ENG, GB
Downloading data from the internet
def latest_jobs(location):
    """Return every table pandas finds on the Arcesium "All Jobs" page.

    ``location`` is accepted but not used here; the result is the list of
    DataFrames produced by ``pd.read_html``.
    """
    jobs_page = "https://careers.arcesium.com/go/All-Jobs/4687610/"
    tables = pd.read_html(jobs_page)
    return tables
latest_jobs("sdometjhr")
[ Title \
0 Senior Software Engineer - Data Platform Seni...
1 Principal Engineer Principal Engineer Hydera...
2 Senior Principal Engineer Senior Principal En...
3 Principal Engineer - Front End Principal Engi...
4 Sales Executive - Institutional Asset Managers...
5 Product Lead - Compliance Product Lead - Comp...
6 Designer-Returnship Designer-Returnship Hyde...
7 Technical Lead Technical Lead Hyderabad, TG,...
8 Business Strategy Lead Business Strategy Lead...
9 Associate Consultant - PCG - PNS Associate Co...
10 Senior Analyst, Sales Ops Senior Analyst, Sal...
11 Senior Specialist, Admin Senior Specialist, A...
12 Associate Director - Test Engineering Associa...
13 Principal Engineer - Linux and Infrastructure ...
14 Distributed Systems Engineer (Remote Eligible)...
15 Senior Specialist-MCA Senior Specialist-MCA ...
16 Manager, Sales Enablement Manager, Sales Enab...
17 Sr Manager, Learning and Development Sr Manag...
18 Manager, Organization Development Manager, Or...
19 Software Engineer - Forward Deployed Team (Rem...
20 Technical Writer-Returnship Technical Writer-...
21 Manager, PMO Manager, PMO Gurugram, HR, IN ...
22 Specialist, HR Business Partner Specialist, H...
23 Principal Engineer - Machine Learning Princip...
24 Solutions Architect - Treasury (Functional), S...
Location
0 New York, NY, US
1 Hyderabad, TG, IN +2 more…
2 Hyderabad, TG, IN +2 more…
3 Hyderabad, TG, IN +2 more…
4 New York, NY, US
5 Hyderabad, TG, IN
6 Hyderabad, TG, IN +2 more…
7 Hyderabad, TG, IN +2 more…
8 New York, NY, US
9 Hyderabad, TG, IN
10 Hyderabad, TG, IN
11 Bengaluru, KA, IN
12 Bengaluru, KA, IN +1 more…
13 Gurugram, HR, IN +2 more…
14 New York, NY, US
15 Bengaluru, KA, IN
16 Hyderabad, TG, IN
17 Hyderabad, TG, IN
18 Hyderabad, TG, IN
19 New York, NY, US
20 Hyderabad, TG, IN +2 more…
21 Gurugram, HR, IN
22 Gurugram, HR, IN
23 Hyderabad, TG, IN
24 Hyderabad, TG, IN +2 more… ]
def latest_jobs(location):
    """Return the first table of the Arcesium "All Jobs" page as a DataFrame.

    ``location`` is accepted but not used here.
    """
    jobs_page = "https://careers.arcesium.com/go/All-Jobs/4687610/"
    tables = pd.read_html(jobs_page)
    return tables[0]
df = latest_jobs("ddsf")
df.columns
Index(['Title', 'Location'], dtype='object')
df[df.Location == "Hyderabad, TG, IN"]
| Title | Location | |
|---|---|---|
| 5 | Product Lead - Compliance Product Lead - Comp... | Hyderabad, TG, IN |
| 9 | Associate Consultant - PCG - PNS Associate Co... | Hyderabad, TG, IN |
| 10 | Senior Analyst, Sales Ops Senior Analyst, Sal... | Hyderabad, TG, IN |
| 16 | Manager, Sales Enablement Manager, Sales Enab... | Hyderabad, TG, IN |
| 17 | Sr Manager, Learning and Development Sr Manag... | Hyderabad, TG, IN |
| 18 | Manager, Organization Development Manager, Or... | Hyderabad, TG, IN |
| 23 | Principal Engineer - Machine Learning Princip... | Hyderabad, TG, IN |
df[df.Location.str.contains("Hyderabad")]
| Title | Location | |
|---|---|---|
| 1 | Principal Engineer Principal Engineer Hydera... | Hyderabad, TG, IN +2 more… |
| 2 | Senior Principal Engineer Senior Principal En... | Hyderabad, TG, IN +2 more… |
| 3 | Principal Engineer - Front End Principal Engi... | Hyderabad, TG, IN +2 more… |
| 5 | Product Lead - Compliance Product Lead - Comp... | Hyderabad, TG, IN |
| 6 | Designer-Returnship Designer-Returnship Hyde... | Hyderabad, TG, IN +2 more… |
| 7 | Technical Lead Technical Lead Hyderabad, TG,... | Hyderabad, TG, IN +2 more… |
| 9 | Associate Consultant - PCG - PNS Associate Co... | Hyderabad, TG, IN |
| 10 | Senior Analyst, Sales Ops Senior Analyst, Sal... | Hyderabad, TG, IN |
| 16 | Manager, Sales Enablement Manager, Sales Enab... | Hyderabad, TG, IN |
| 17 | Sr Manager, Learning and Development Sr Manag... | Hyderabad, TG, IN |
| 18 | Manager, Organization Development Manager, Or... | Hyderabad, TG, IN |
| 20 | Technical Writer-Returnship Technical Writer-... | Hyderabad, TG, IN +2 more… |
| 23 | Principal Engineer - Machine Learning Princip... | Hyderabad, TG, IN |
| 24 | Solutions Architect - Treasury (Functional), S... | Hyderabad, TG, IN +2 more… |
def latest_jobs(location):
    """Return the jobs on the Arcesium "All Jobs" page at ``location``.

    Args:
        location: Text to look for in the Location column, e.g. "Hyderabad".

    Returns:
        pandas.DataFrame with the rows of the first table on the page whose
        Location contains ``location``; the original row labels are kept.
    """
    url = "https://careers.arcesium.com/go/All-Jobs/4687610/"
    df = pd.read_html(url)[0]
    # Match the text literally (a place name is not a regex pattern) and
    # treat a missing Location as "no match" so the mask stays boolean.
    matches = df.Location.str.contains(location, regex=False, na=False)
    return df[matches]
latest_jobs("Hyderabad")
| Title | Location | |
|---|---|---|
| 1 | Principal Engineer Principal Engineer Hydera... | Hyderabad, TG, IN +2 more… |
| 2 | Senior Principal Engineer Senior Principal En... | Hyderabad, TG, IN +2 more… |
| 3 | Principal Engineer - Front End Principal Engi... | Hyderabad, TG, IN +2 more… |
| 5 | Product Lead - Compliance Product Lead - Comp... | Hyderabad, TG, IN |
| 6 | Designer-Returnship Designer-Returnship Hyde... | Hyderabad, TG, IN +2 more… |
| 7 | Technical Lead Technical Lead Hyderabad, TG,... | Hyderabad, TG, IN +2 more… |
| 9 | Associate Consultant - PCG - PNS Associate Co... | Hyderabad, TG, IN |
| 10 | Senior Analyst, Sales Ops Senior Analyst, Sal... | Hyderabad, TG, IN |
| 16 | Manager, Sales Enablement Manager, Sales Enab... | Hyderabad, TG, IN |
| 17 | Sr Manager, Learning and Development Sr Manag... | Hyderabad, TG, IN |
| 18 | Manager, Organization Development Manager, Or... | Hyderabad, TG, IN |
| 20 | Technical Writer-Returnship Technical Writer-... | Hyderabad, TG, IN +2 more… |
| 23 | Principal Engineer - Machine Learning Princip... | Hyderabad, TG, IN |
| 24 | Solutions Architect - Treasury (Functional), S... | Hyderabad, TG, IN +2 more… |
import pandas as pd
import requests
def latest_jobs(location):
    """Fetch the Arcesium jobs page and return up to 25 jobs at ``location``.

    Reads the first table carrying the HTML class 'JobListingsTable', keeps
    the 'Job Title' and 'Location' columns, filters on ``location`` and
    renumbers the rows from 0.
    """
    url = 'https://careers.arcesium.com/go/All-Jobs/4687610/'
    page = requests.get(url)
    page.raise_for_status()
    listings = pd.read_html(page.text, attrs={'class': 'JobListingsTable'})[0]
    jobs = listings[['Job Title', 'Location']]
    at_location = jobs[jobs['Location'].str.contains(location)]
    return at_location.reset_index(drop=True).head(25)
import pandas as pd
import requests
def latest_jobs(location):
    """Return up to 25 rows of the 'JobListingsTable' table at ``location``.

    The page is downloaded with requests, parsed with ``pd.read_html`` and
    reduced to the 'Job Title' and 'Location' columns before filtering.
    """
    url = 'https://careers.arcesium.com/go/All-Jobs/4687610/'
    reply = requests.get(url)
    reply.raise_for_status()
    html = reply.text
    tables = pd.read_html(html, attrs={'class': 'JobListingsTable'})
    selected = tables[0][['Job Title', 'Location']]
    is_match = selected['Location'].str.contains(location)
    result = selected[is_match].reset_index(drop=True)
    return result.head(25)
latest_jobs("Hyderabad")
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) Cell In[68], line 1 ----> 1 latest_jobs("Hyderabad") Cell In[67], line 9, in latest_jobs(location) 7 response.raise_for_status() 8 data = response.text ----> 9 df = pd.read_html(data, attrs={'class': 'JobListingsTable'})[0] 10 df = df[['Job Title', 'Location']] 11 df = df[df['Location'].str.contains(location)] File ~/usr/local/default/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(*args, **kwargs) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/html.py:1205, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links) 1201 validate_header_arg(header) 1203 io = stringify_path(io) -> 1205 return _parse( 1206 flavor=flavor, 1207 io=io, 1208 match=match, 1209 header=header, 1210 index_col=index_col, 1211 skiprows=skiprows, 1212 parse_dates=parse_dates, 1213 thousands=thousands, 1214 attrs=attrs, 1215 encoding=encoding, 1216 decimal=decimal, 1217 converters=converters, 1218 na_values=na_values, 1219 keep_default_na=keep_default_na, 1220 displayed_only=displayed_only, 1221 extract_links=extract_links, 1222 ) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/html.py:982, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs) 980 retained = None 981 for flav in flavor: --> 982 parser = _parser_dispatch(flav) 983 p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links) 985 try: File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/html.py:931, in 
_parser_dispatch(flavor) 929 if flavor in ("bs4", "html5lib"): 930 if not _HAS_HTML5LIB: --> 931 raise ImportError("html5lib not found, please install it") 932 if not _HAS_BS4: 933 raise ImportError("BeautifulSoup4 (bs4) not found, please install it") ImportError: html5lib not found, please install it
!pip install html5lib
Collecting html5lib Using cached html5lib-1.1-py2.py3-none-any.whl (112 kB) Requirement already satisfied: webencodings in /home/vikrant/usr/local/default/lib/python3.10/site-packages (from html5lib) (0.5.1) Requirement already satisfied: six>=1.9 in /home/vikrant/usr/local/default/lib/python3.10/site-packages (from html5lib) (1.16.0) Installing collected packages: html5lib Successfully installed html5lib-1.1
import pandas as pd
import requests
def latest_jobs(location):
    """Download the jobs page and return at most 25 jobs at ``location``.

    Only a table with the HTML class 'JobListingsTable' is considered, and
    only its 'Job Title' and 'Location' columns are kept.
    """
    url = 'https://careers.arcesium.com/go/All-Jobs/4687610/'
    resp = requests.get(url)
    resp.raise_for_status()
    # NOTE(review): the run recorded after this cell ends in "No tables
    # found", so this class filter apparently matches nothing on the page.
    job_table = pd.read_html(resp.text, attrs={'class': 'JobListingsTable'})[0]
    job_table = job_table[['Job Title', 'Location']]
    hits = job_table[job_table['Location'].str.contains(location)]
    hits = hits.reset_index(drop=True)
    return hits.head(25)
latest_jobs("Hyderabad")
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[1], line 15 12 df.reset_index(drop=True, inplace=True) 13 return df.head(25) ---> 15 latest_jobs("Hyderabad") Cell In[1], line 9, in latest_jobs(location) 7 response.raise_for_status() 8 data = response.text ----> 9 df = pd.read_html(data, attrs={'class': 'JobListingsTable'})[0] 10 df = df[['Job Title', 'Location']] 11 df = df[df['Location'].str.contains(location)] File ~/usr/local/default/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(*args, **kwargs) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/html.py:1205, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links) 1201 validate_header_arg(header) 1203 io = stringify_path(io) -> 1205 return _parse( 1206 flavor=flavor, 1207 io=io, 1208 match=match, 1209 header=header, 1210 index_col=index_col, 1211 skiprows=skiprows, 1212 parse_dates=parse_dates, 1213 thousands=thousands, 1214 attrs=attrs, 1215 encoding=encoding, 1216 decimal=decimal, 1217 converters=converters, 1218 na_values=na_values, 1219 keep_default_na=keep_default_na, 1220 displayed_only=displayed_only, 1221 extract_links=extract_links, 1222 ) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/html.py:1006, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs) 1004 else: 1005 assert retained is not None # for mypy -> 1006 raise retained 1008 ret = [] 1009 for table in tables: File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/html.py:986, in 
_parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs) 983 p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links) 985 try: --> 986 tables = p.parse_tables() 987 except ValueError as caught: 988 # if `io` is an io-like object, check if it's seekable 989 # and try to rewind it before trying the next parser 990 if hasattr(io, "seekable") and io.seekable(): File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/html.py:262, in _HtmlFrameParser.parse_tables(self) 254 def parse_tables(self): 255 """ 256 Parse and return all tables from the DOM. 257 (...) 260 list of parsed (header, body, footer) tuples from tables. 261 """ --> 262 tables = self._parse_tables(self._build_doc(), self.match, self.attrs) 263 return (self._parse_thead_tbody_tfoot(table) for table in tables) File ~/usr/local/default/lib/python3.10/site-packages/pandas/io/html.py:618, in _BeautifulSoupHtml5LibFrameParser._parse_tables(self, doc, match, attrs) 615 tables = doc.find_all(element_name, attrs=attrs) 617 if not tables: --> 618 raise ValueError("No tables found") 620 result = [] 621 unique_tables = set() ValueError: No tables found
import pandas as pd
import requests
def latest_jobs(location):
    """Return the jobs at ``location`` among the latest 25 Arcesium postings.

    Args:
        location: Text to look for in the Location column, e.g. "London".

    Returns:
        pandas.DataFrame with the rows of the first table on the page, cut to
        its first 25 rows and then filtered to those whose Location contains
        ``location``. Rows are renumbered from 0.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    from io import StringIO  # local import: only this function needs it

    url = 'https://careers.arcesium.com/go/All-Jobs/4687610/'
    # A timeout keeps a stalled connection from hanging the caller forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Hand read_html a file-like object: passing literal HTML text is
    # deprecated in newer pandas versions.
    df = pd.read_html(StringIO(response.text))[0]
    # Restrict to the latest 25 postings first, then filter, so the result
    # is "matches among the latest 25" rather than "first 25 matches".
    latest = df.head(25)
    # Literal, NaN-safe match: a place name is not a regex pattern and a
    # missing Location must not break the boolean mask.
    matches = latest['Location'].str.contains(location, regex=False, na=False)
    return latest[matches].reset_index(drop=True)
latest_jobs("Hyderabad")
| Title | Location | |
|---|---|---|
| 0 | Principal Engineer Principal Engineer Hydera... | Hyderabad, TG, IN +2 more… |
| 1 | Senior Principal Engineer Senior Principal En... | Hyderabad, TG, IN +2 more… |
| 2 | Principal Engineer - Front End Principal Engi... | Hyderabad, TG, IN +2 more… |
| 3 | Product Lead - Compliance Product Lead - Comp... | Hyderabad, TG, IN |
| 4 | Designer-Returnship Designer-Returnship Hyde... | Hyderabad, TG, IN +2 more… |
| 5 | Technical Lead Technical Lead Hyderabad, TG,... | Hyderabad, TG, IN +2 more… |
| 6 | Associate Consultant - PCG - PNS Associate Co... | Hyderabad, TG, IN |
| 7 | Senior Analyst, Sales Ops Senior Analyst, Sal... | Hyderabad, TG, IN |
| 8 | Manager, Sales Enablement Manager, Sales Enab... | Hyderabad, TG, IN |
| 9 | Sr Manager, Learning and Development Sr Manag... | Hyderabad, TG, IN |
| 10 | Manager, Organization Development Manager, Or... | Hyderabad, TG, IN |
| 11 | Technical Writer-Returnship Technical Writer-... | Hyderabad, TG, IN +2 more… |
| 12 | Principal Engineer - Machine Learning Princip... | Hyderabad, TG, IN |
| 13 | Solutions Architect - Treasury (Functional), S... | Hyderabad, TG, IN +2 more… |