import doctest
import io
import pandas as pd

import doctest
doctest.testmod()

pd.read_csv("earthquakes.csv")

csv = """
Name,Hours
Anna,20
Iris,15
Abiy,10
Gege,12
"""

staff = pd.read_csv(io.StringIO(csv))
staff

staff = staff.set_index("Name")
staff

staff.index

Index(['Anna', 'Iris', 'Abiy', 'Gege'], dtype='object', name='Name')

staff["Hours"]

Name
Anna    20
Iris    15
Abiy    10
Gege    12
Name: Hours, dtype: int64

staff["Hours"].unique()

array([20, 15, 10, 12])

staff["Hours"].describe()

count     4.000000
mean     14.250000
std       4.349329
min      10.000000
25%      11.500000
50%      13.500000
75%      16.250000
max      20.000000
Name: Hours, dtype: float64

staff["Hours"]

Name
Anna    20
Iris    15
Abiy    10
Gege    12
Name: Hours, dtype: int64

# How to select the TA(s) with 10 hours
staff[staff["Hours"] == 10]

staff["Hours"]["Iris"]

15

staff["Hours"].max() - staff["Hours"].min()

10

staff.max() - staff.min()

Hours    10
dtype: int64

staff.min()

Hours    10
dtype: int64

csv = """
City,Country,Emissions,Population
New York,USA,200,1500
Paris,France,48,42
Beijing,China,300,2000
Nice,France,40,60
Seattle,USA,100,1000
"""

emissions = pd.read_csv(io.StringIO(csv), index_col="City")
# index_col="City" is like calling set_index("City"), but a little less coding
emissions

emissions["Emissions"] / emissions["Population"]

City
New York    0.133333
Paris       1.142857
Beijing     0.150000
Nice        0.666667
Seattle     0.100000
dtype: float64

emissions["Population"] + 4

City
New York    1504
Paris         46
Beijing     2004
Nice          64
Seattle     1004
Name: Population, dtype: int64

high_emissions = emissions["Emissions"] >= 200
emissions[high_emissions]

high_emissions

City
New York     True
Paris       False
Beijing      True
Nice        False
Seattle     False
Name: Emissions, dtype: bool

emissions["Country"]

City
New York       USA
Paris       France
Beijing      China
Nice        France
Seattle        USA
Name: Country, dtype: object

"USA" not in list(emissions["Country"]) and "France" in list(emissions["Country"])

False

emissions[high_emissions | (emissions["Country"] == "USA")]

[True, False] & [True, True] # [True, False]

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[55], line 1
----> 1 [True, False] & [True, True] # [True, False]

TypeError: unsupported operand type(s) for &: 'list' and 'list'

emissions[(emissions["Country"] == "France") & (emissions["Population"] > 50)]

(emissions["Country"] == "France") & (emissions["Population"] > 50)

City
New York    False
Paris       False
Beijing     False
Nice         True
Seattle     False
dtype: bool

emissions["Country" == "France" and "Population" > 50]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/base.py:3791, in Index.get_loc(self, key)
   3790 try:
-> 3791     return self._engine.get_loc(casted_key)
   3792 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: False

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[54], line 1
----> 1 emissions["Country" == "France" and "Population" > 50]

File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:3893, in DataFrame.__getitem__(self, key)
   3891 if self.columns.nlevels > 1:
   3892     return self._getitem_multilevel(key)
-> 3893 indexer = self.columns.get_loc(key)
   3894 if is_integer(indexer):
   3895     indexer = [indexer]

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/base.py:3798, in Index.get_loc(self, key)
   3793     if isinstance(casted_key, slice) or (
   3794         isinstance(casted_key, abc.Iterable)
   3795         and any(isinstance(x, slice) for x in casted_key)
   3796     ):
   3797         raise InvalidIndexError(key)
-> 3798     raise KeyError(key) from err
   3799 except TypeError:
   3800     # If we have a listlike key, _check_indexing_error will raise
   3801     #  InvalidIndexError. Otherwise we fall through and re-raise
   3802     #  the TypeError.
   3803     self._check_indexing_error(key)

KeyError: False

emissions[high_emissions]["Population"]

City
New York    1500
Beijing     2000
Name: Population, dtype: int64

emissions.loc[high_emissions, "Population"]

City
New York    1500
Beijing     2000
Name: Population, dtype: int64

emissions

emissions.loc[high_emissions, "Country":"Emissions"]

emissions.loc[:, ["Country", "Emissions"]]

emissions[["Country", "Emissions"]]

emissions.loc["Paris", "Country"]

'France'

emissions.loc[["Paris"], ["Country", "Population"]]

staff.loc["Iris", "Hours"]

15

def largest_earthquake_place(path):
    """
    Returns the name of the place with the largest-magnitude earthquake in the specified CSV file.

    The CSV must contain "name" and "magnitude" columns. Rows whose magnitude is missing (NaN)
    are skipped. When several rows share the largest magnitude, the earliest row wins. Returns
    None if no row has a magnitude.

    >>> largest_earthquake_place("earthquakes.csv")
    'Northern Mariana Islands'
    """
    earthquakes = pd.read_csv(path).to_dict("records")

    max_name = None
    max_magn = None
    for earthquake in earthquakes:
        magnitude = earthquake["magnitude"]
        # Any comparison against NaN is False, so a NaN that became the running maximum
        # could never be replaced by a real magnitude: skip missing values up front.
        if pd.isna(magnitude):
            continue
        if max_magn is None or magnitude > max_magn:
            max_name = earthquake["name"]
            max_magn = magnitude
    return max_name


doctest.run_docstring_examples(largest_earthquake_place, globals())

def largest_earthquake_place(path):
    """
    Returns the name of the place with the largest-magnitude earthquake in the specified CSV file.

    The CSV must contain "id", "name" and "magnitude" columns; "id" is used as the row index.
    The lookup by label relies on the id values being unique, so that it yields a single name.

    >>> largest_earthquake_place("earthquakes.csv")
    'Northern Mariana Islands'
    """
    earthquakes = pd.read_csv(path, index_col="id")
    # idxmax returns the index label (here, the earthquake id) of the first row holding the
    # largest magnitude; that label then selects the matching "name" value.
    strongest_id = earthquakes["magnitude"].idxmax()
    return earthquakes.loc[strongest_id, "name"]


doctest.run_docstring_examples(largest_earthquake_place, globals())

emissions.iloc[:, -2:]

	id	year	month	day	latitude	longitude	name	magnitude
0	nc72666881	2016	7	27	37.672333	-121.619000	California	1.43
1	us20006i0y	2016	7	27	21.514600	94.572100	Burma	4.90
2	nc72666891	2016	7	27	37.576500	-118.859167	California	0.06
3	nc72666896	2016	7	27	37.595833	-118.994833	California	0.40
4	nn00553447	2016	7	27	39.377500	-119.845000	Nevada	0.30
...	...	...	...	...	...	...	...	...
8389	nc72685246	2016	8	25	36.515499	-121.099831	California	2.42
8390	ak13879193	2016	8	25	61.498400	-149.862700	Alaska	1.40
8391	nc72685251	2016	8	25	38.805000	-122.821503	California	1.06
8392	ci37672328	2016	8	25	34.308000	-118.635333	California	1.55
8393	ci37672360	2016	8	25	34.119167	-116.933667	California	0.89

	year	month	day	latitude	longitude	name	magnitude
id
nc72666881	2016	7	27	37.672333	-121.619000	California	1.43
us20006i0y	2016	7	27	21.514600	94.572100	Burma	4.90
nc72666891	2016	7	27	37.576500	-118.859167	California	0.06
nc72666896	2016	7	27	37.595833	-118.994833	California	0.40
nn00553447	2016	7	27	39.377500	-119.845000	Nevada	0.30
...	...	...	...	...	...	...	...
nc72685246	2016	8	25	36.515499	-121.099831	California	2.42
ak13879193	2016	8	25	61.498400	-149.862700	Alaska	1.40
nc72685251	2016	8	25	38.805000	-122.821503	California	1.06
ci37672328	2016	8	25	34.308000	-118.635333	California	1.55
ci37672360	2016	8	25	34.119167	-116.933667	California	0.89

Data Frames

Import statements

Creating a Data Frame

Column indexers

Element-wise operations

Row indexers

Selection by label

Practice: Largest earthquake place (Pandas)

Optional: Selection by position

	Country	Emissions	Population
City
New York	USA	200	1500
Paris	France	48	42
Beijing	China	300	2000
Nice	France	40	60
Seattle	USA	100	1000