Data Frames¶

Earlier, we learned how to process CSV files using the list of dictionaries representation. This week, we will introduce pandas, the most commonly used Python data programming tool and the one we'll be using for the remainder of the course. By the end of this lesson, students will be able to:

  • Import values and functions from another module using import and from statements.
  • Select individual columns from a pandas DataFrame and apply element-wise computations.
  • Filter a pandas DataFrame or Series with a boolean series.

The last two learning objectives are particularly ambitious: they will take much more deliberate practice before you feel comfortable with them.

In [1]:
import doctest
import io
import pandas as pd # UW-IT JupyterHub specifically has this pre-installed

Import statements¶

At the top of each notebook, we've frequently written some curious lines of code called import statements. A module is a file containing Python code. Some modules like doctest are included in every Python installation: you just need to import them to use them. But there are also modules that aren't built into Python, like pandas, the data analysis and manipulation tool that we'll be learning today. Our JupyterHub environment includes pandas, but it is not included by default in every Python installation. We'll learn how to install new modules later.

The simplest syntax uses the import statement to import a module like doctest. We can then call the definitions within that module like doctest.testmod() to run all our doctests.

import doctest
doctest.testmod()

We can also import a module and rename it to a more convenient shorthand, like pd instead of pandas. We can then call the definitions within the module like pd.read_csv(path).to_dict("records") to read a CSV file and then convert it into our list of dictionaries ("records") representation.

import pandas as pd
earthquakes = pd.read_csv(path).to_dict("records")

Finally, Python can also import just a single definition from a module. Here, we ask Python to import only the Counter dictionary type from the collections module; the snippet below uses it to count how many times each whitespace-separated word appears in a file.

from collections import Counter
with open(path) as f:
    word_counts = Counter(f.read().split())

A common practice in notebooks is to put all your imports in the first code cell at the top of the notebook, as in In [1] above, so that anyone running your notebook knows which modules they need to install to run the code.

Creating a Data Frame¶

To create a DataFrame, call pd.read_csv(path). In addition to reading CSV data from a file, pd.read_csv also accepts io.StringIO, which lets us read CSV data from a Python string so that small datasets can be specified directly in a code cell.

In [2]:
csv = """
Name,Hours
Anna,20
Iris,15
Abiy,10
Gege,12
"""

staff = pd.read_csv(io.StringIO(csv))
staff
Out[2]:
Name Hours
0 Anna 20
1 Iris 15
2 Abiy 10
3 Gege 12

The index of a DataFrame appears in bold along the left side (one label per row) and defines the keys for accessing values in the data frame. Like keys in a dictionary, the keys in an index should be unique.

By default, an integer index is provided, but you'll often want to set a more meaningful index. The df.set_index(colname) method returns a new DataFrame whose index comes from the given column, which will come in handy later. In the example below, we assume that each TA has a unique name, though this assumption has serious limits in practice: people can change their names, and we might eventually run a course where two people share the same name.

In [3]:
staff = staff.set_index("Name")
staff
Out[3]:
Hours
Name
Anna 20
Iris 15
Abiy 10
Gege 12

Column indexers¶

In pandas, tabular data is represented by a DataFrame, as shown above. Unlike the list of dictionaries format, which required us to write a loop to access the name of every TA, pandas provides special syntax for selecting an entire column at once.

In [4]:
staff.index
Out[4]:
Index(['Anna', 'Iris', 'Abiy', 'Gege'], dtype='object', name='Name')
In [5]:
staff["Hours"]
Out[5]:
Name
Anna    20
Iris    15
Abiy    10
Gege    12
Name: Hours, dtype: int64
In [6]:
# Step 1: Picking out the column for "Hours"
# Step 2: Picking out the row for the name, "Iris" (index)
staff["Hours"]["Iris"]
Out[6]:
15
In [44]:
# Error here is due to the first indexing operation looking for a column named "Iris"
staff["Iris"]["Hours"]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Iris'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[44], line 2
      1 # Error here is due to the first indexing operation looking for a column named "Iris"
----> 2 staff["Iris"]["Hours"]

File /opt/conda/lib/python3.11/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'Iris'

df["Hours"] returns a pandas object called a Series that represents a single column or row of a DataFrame. A Series is very similar to a list from Python, but has several convenient functions for data analysis.

  • s.mean() returns the average value in s.
  • s.min() returns the minimum value in s.
    • s.idxmin() returns the label of the minimum value in s.
  • s.max() returns the maximum value in s.
    • s.idxmax() returns the label of the maximum value in s.
  • s.unique() returns an array containing the unique values in s.
  • s.describe() returns a new Series containing descriptive statistics for the data in s.
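
As a quick sketch of a few of these in action on the staff data from above (the comments show the values we'd expect):

hours = staff["Hours"]
hours.mean()    # 14.25, the average of 20, 15, 10, and 12
hours.idxmax()  # 'Anna', the index label for the maximum value 20
hours.unique()  # the unique values 20, 15, 10, 12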
In [8]:
staff["Hours"].describe()
Out[8]:
count     4.000000
mean     14.250000
std       4.349329
min      10.000000
25%      11.500000
50%      13.500000
75%      16.250000
max      20.000000
Name: Hours, dtype: float64
In [9]:
# What is the name of the person with the minimum number of hours?
staff["Hours"].idxmin()
Out[9]:
'Abiy'

Defining a more meaningful index allows us to select specific values from a Series just by referring to the desired key.

In [10]:
staff["Hours"]["Iris"]
Out[10]:
15

How can we compute the range of TA hours by calling the min() and max() functions? For this example dataset, the range should be 10 since Anna has 20 hours and Abiy has 10 hours for a difference of 10.

In [12]:
# A lot of pandas code is going to be like this
staff["Hours"].max() - staff["Hours"].min()
Out[12]:
10

Element-wise operations¶

Let's consider a slightly more complex dataset that has more columns, like this made-up emissions dataset. The pd.read_csv function also includes an index_col parameter that you can use to set the index while reading the dataset.

In [13]:
csv = """
City,Country,Emissions,Population
New York,USA,200,1500
Paris,France,48,42
Beijing,China,300,2000
Nice,France,40,60
Seattle,USA,100,1000
"""

emissions = pd.read_csv(io.StringIO(csv), index_col="City")
emissions
Out[13]:
Country Emissions Population
City
New York USA 200 1500
Paris France 48 42
Beijing China 300 2000
Nice France 40 60
Seattle USA 100 1000

pandas can help us answer questions like: what are the emissions per capita, i.e. emissions divided by population, for each city?

In [14]:
emissions["Emissions"] / emissions["Population"]
Out[14]:
City
New York    0.133333
Paris       1.142857
Beijing     0.150000
Nice        0.666667
Seattle     0.100000
dtype: float64
In [16]:
# DataFrame / Series: pandas aligns the Series' index labels against the
# DataFrame's column labels. Here, the city names don't match any column
# name, so every entry in the result is NaN (missing)
emissions / emissions["Emissions"]

# Think about what you are performing the operation on: is it a single number, a Series, or a DataFrame?
Out[16]:
Beijing Country Emissions New York Nice Paris Population Seattle
City
New York NaN NaN NaN NaN NaN NaN NaN NaN
Paris NaN NaN NaN NaN NaN NaN NaN NaN
Beijing NaN NaN NaN NaN NaN NaN NaN NaN
Nice NaN NaN NaN NaN NaN NaN NaN NaN
Seattle NaN NaN NaN NaN NaN NaN NaN NaN

Element-wise operations also work if one of the operands is a single value rather than a Series. For example, the following cell adds 4 to each city population.

In [15]:
emissions["Population"] + 4
Out[15]:
City
New York    1504
Paris         46
Beijing     2004
Nice          64
Seattle     1004
Name: Population, dtype: int64

We can also sort the rows of a DataFrame by the values in a particular column using df.sort_values(colname), which sorts in ascending order by default; pass ascending=False to sort from largest to smallest.

In [32]:
emissions.sort_values("Emissions")
Out[32]:
Country Emissions Population
City
Nice France 40 60
Paris France 48 42
Seattle USA 100 1000
New York USA 200 1500
Beijing China 300 2000
In [34]:
emissions.sort_values("Emissions", ascending=False)
Out[34]:
Country Emissions Population
City
Beijing China 300 2000
New York USA 200 1500
Seattle USA 100 1000
Paris France 48 42
Nice France 40 60

Row indexers¶

All the above operations apply to every row in the original data frame. What if our questions involve returning just a few rows, like filtering the data to identify only the cities that have at least 200 emissions?

In [17]:
high_emissions = emissions["Emissions"] >= 200
emissions[high_emissions]
Out[17]:
Country Emissions Population
City
New York USA 200 1500
Beijing China 300 2000

This new syntax shows how we can filter a DataFrame by indexing it with a boolean series. PandasTutor shows how the above output is determined by selecting only the rows that are True in the following boolean series.

In [18]:
high_emissions
Out[18]:
City
New York     True
Paris       False
Beijing      True
Nice        False
Seattle     False
Name: Emissions, dtype: bool

Multiple conditions can be combined using the following element-wise operators.

  • & performs an element-wise and operation.
  • | performs an element-wise or operation.
  • ~ performs an element-wise not operation (see the sketch after the example below).

Due to Python's order of operations (the element-wise operators bind more tightly than comparisons like == and >), parentheses are required around each comparison when combining boolean series in a single expression.

In [25]:
emissions[high_emissions | (emissions["Country"] == "USA")]
Out[25]:
Country Emissions Population
City
New York USA 200 1500
Beijing China 300 2000
Seattle USA 100 1000
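
The ~ operator negates a boolean series. As a small sketch, either of the following lines should select the cities outside the USA (Paris, Beijing, and Nice):

emissions[~(emissions["Country"] == "USA")]
emissions[emissions["Country"] != "USA"]  # equivalent, without ~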

Write a one-line pandas expression that returns all the cities in France that have a population greater than 50 from the emissions dataset.

In [26]:
(emissions["Country"] == "France") & (emissions["Population"] > 50)
Out[26]:
City
New York    False
Paris       False
Beijing     False
Nice         True
Seattle     False
dtype: bool
In [27]:
emissions[(emissions["Country"] == "France") & (emissions["Population"] > 50)]
Out[27]:
Country Emissions Population
City
Nice France 40 60
In [29]:
# Without parentheses, & binds more tightly than == and >, so Python first
# tries to evaluate "France" & emissions["Population"], which fails
emissions[emissions["Country"] == "France" & emissions["Population"] > 50]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:362, in na_logical_op(x, y, op)
    353 try:
    354     # For exposition, write:
    355     #  yarr = isinstance(y, np.ndarray)
   (...)
    360     # Then Cases where this goes through without raising include:
    361     #  (xint or xbool) and (yint or bool)
--> 362     result = op(x, y)
    363 except TypeError:

File /opt/conda/lib/python3.11/site-packages/pandas/core/roperator.py:54, in rand_(left, right)
     53 def rand_(left, right):
---> 54     return operator.and_(right, left)

TypeError: ufunc 'bitwise_and' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:376, in na_logical_op(x, y, op)
    375 try:
--> 376     result = libops.scalar_binop(x, y, op)
    377 except (
    378     TypeError,
    379     ValueError,
   (...)
    382     NotImplementedError,
    383 ) as err:

File ops.pyx:180, in pandas._libs.ops.scalar_binop()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 emissions[emissions["Country"] == "France" & emissions["Population"] > 50]

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/common.py:76, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
     72             return NotImplemented
     74 other = item_from_zerodim(other)
---> 76 return method(self, other)

File /opt/conda/lib/python3.11/site-packages/pandas/core/arraylike.py:74, in OpsMixin.__rand__(self, other)
     72 @unpack_zerodim_and_defer("__rand__")
     73 def __rand__(self, other):
---> 74     return self._logical_method(other, roperator.rand_)

File /opt/conda/lib/python3.11/site-packages/pandas/core/series.py:6130, in Series._logical_method(self, other, op)
   6127 lvalues = self._values
   6128 rvalues = extract_array(other, extract_numpy=True, extract_range=True)
-> 6130 res_values = ops.logical_op(lvalues, rvalues, op)
   6131 return self._construct_result(res_values, name=res_name)

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:454, in logical_op(left, right, op)
    450 else:
    451     # i.e. scalar
    452     is_other_int_dtype = lib.is_integer(rvalues)
--> 454 res_values = na_logical_op(lvalues, rvalues, op)
    456 # For int vs int `^`, `|`, `&` are bitwise operators and return
    457 #   integer dtypes.  Otherwise these are boolean ops
    458 if not (left.dtype.kind in "iu" and is_other_int_dtype):

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:385, in na_logical_op(x, y, op)
    377         except (
    378             TypeError,
    379             ValueError,
   (...)
    382             NotImplementedError,
    383         ) as err:
    384             typ = type(y).__name__
--> 385             raise TypeError(
    386                 f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array "
    387                 f"and scalar of type [{typ}]"
    388             ) from err
    390 return result.reshape(x.shape)

TypeError: Cannot perform 'rand_' with a dtyped [int64] array and scalar of type [bool]
In [28]:
# Is a DataFrame that has rows truth-y or false-y?
emissions[emissions["Country"] == "France"] and emissions[emissions["Population"] > 50]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_196/1013328691.py in ?()
      1 # Is a DataFrame that has rows truth-y or false-y?
----> 2 emissions[emissions["Country"] == "France"] and emissions[emissions["Population"] > 50]

/opt/conda/lib/python3.11/site-packages/pandas/core/generic.py in ?(self)
   1575     @final
   1576     def __nonzero__(self) -> NoReturn:
-> 1577         raise ValueError(
   1578             f"The truth value of a {type(self).__name__} is ambiguous. "
   1579             "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
   1580         )

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

The and keyword fails because pandas refuses to reduce a whole DataFrame to a single True or False value, as the error message explains. The element-wise & operator fails differently below: applied to two DataFrames, it tries to combine their string values element-wise, which isn't supported.

In [30]:
emissions[emissions["Country"] == "France"] & emissions[emissions["Population"] > 50]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:362, in na_logical_op(x, y, op)
    353 try:
    354     # For exposition, write:
    355     #  yarr = isinstance(y, np.ndarray)
   (...)
    360     # Then Cases where this goes through without raising include:
    361     #  (xint or xbool) and (yint or bool)
--> 362     result = op(x, y)
    363 except TypeError:

TypeError: unsupported operand type(s) for &: 'float' and 'str'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
Cell In[30], line 1
----> 1 emissions[emissions["Country"] == "France"] & emissions[emissions["Population"] > 50]

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/common.py:76, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
     72             return NotImplemented
     74 other = item_from_zerodim(other)
---> 76 return method(self, other)

File /opt/conda/lib/python3.11/site-packages/pandas/core/arraylike.py:70, in OpsMixin.__and__(self, other)
     68 @unpack_zerodim_and_defer("__and__")
     69 def __and__(self, other):
---> 70     return self._logical_method(other, operator.and_)

File /opt/conda/lib/python3.11/site-packages/pandas/core/frame.py:7913, in DataFrame._arith_method(self, other, op)
   7910 self, other = self._align_for_op(other, axis, flex=True, level=None)
   7912 with np.errstate(all="ignore"):
-> 7913     new_data = self._dispatch_frame_op(other, op, axis=axis)
   7914 return self._construct_result(new_data)

File /opt/conda/lib/python3.11/site-packages/pandas/core/frame.py:7956, in DataFrame._dispatch_frame_op(self, right, func, axis)
   7950     assert self.columns.equals(right.columns)
   7951     # TODO: The previous assertion `assert right._indexed_same(self)`
   7952     #  fails in cases with empty columns reached via
   7953     #  _frame_arith_method_with_reindex
   7954 
   7955     # TODO operate_blockwise expects a manager of the same type
-> 7956     bm = self._mgr.operate_blockwise(
   7957         # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
   7958         # incompatible type "Union[ArrayManager, BlockManager]"; expected
   7959         # "ArrayManager"
   7960         # error: Argument 1 to "operate_blockwise" of "BlockManager" has
   7961         # incompatible type "Union[ArrayManager, BlockManager]"; expected
   7962         # "BlockManager"
   7963         right._mgr,  # type: ignore[arg-type]
   7964         array_op,
   7965     )
   7966     return self._constructor_from_mgr(bm, axes=bm.axes)
   7968 elif isinstance(right, Series) and axis == 1:
   7969     # axis=1 means we want to operate row-by-row

File /opt/conda/lib/python3.11/site-packages/pandas/core/internals/managers.py:1511, in BlockManager.operate_blockwise(self, other, array_op)
   1507 def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
   1508     """
   1509     Apply array_op blockwise with another (aligned) BlockManager.
   1510     """
-> 1511     return operate_blockwise(self, other, array_op)

File /opt/conda/lib/python3.11/site-packages/pandas/core/internals/ops.py:65, in operate_blockwise(left, right, array_op)
     63 res_blks: list[Block] = []
     64 for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right):
---> 65     res_values = array_op(lvals, rvals)
     66     if (
     67         left_ea
     68         and not right_ea
     69         and hasattr(res_values, "reshape")
     70         and not is_1d_only_ea_dtype(res_values.dtype)
     71     ):
     72         res_values = res_values.reshape(1, -1)

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:454, in logical_op(left, right, op)
    450 else:
    451     # i.e. scalar
    452     is_other_int_dtype = lib.is_integer(rvalues)
--> 454 res_values = na_logical_op(lvalues, rvalues, op)
    456 # For int vs int `^`, `|`, `&` are bitwise operators and return
    457 #   integer dtypes.  Otherwise these are boolean ops
    458 if not (left.dtype.kind in "iu" and is_other_int_dtype):

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:369, in na_logical_op(x, y, op)
    367     x = ensure_object(x)
    368     y = ensure_object(y)
--> 369     result = libops.vec_binop(x.ravel(), y.ravel(), op)
    370 else:
    371     # let null fall thru
    372     assert lib.is_scalar(y)

File ops.pyx:252, in pandas._libs.ops.vec_binop()

File ops.pyx:245, in pandas._libs.ops.vec_binop()

TypeError: unsupported operand type(s) for &: 'str' and 'str'

Selection by label¶

To summarize what we've learned so far, pandas provides both column indexers and row indexers accessible through the square brackets notation.

  • df[colname] returns the corresponding Series from the df.
  • df[boolean_series] returns a new DataFrame containing just the rows marked True in the boolean_series.

These two access methods are special cases of the more general df.loc[rows, columns] selection syntax, which provides even more functionality. For example, we can select just the city populations for cities with at least 200 emissions and visualize the procedure in PandasTutor.

In [37]:
emissions[high_emissions]["Population"]
Out[37]:
City
New York    1500
Beijing     2000
Name: Population, dtype: int64
In [35]:
# Result is a Series: the selection can match multiple rows, but only one column
emissions.loc[high_emissions, "Population"]
Out[35]:
City
New York    1500
Beijing     2000
Name: Population, dtype: int64

Whether a single value, a 1-dimensional Series, or a 2-dimensional DataFrame is returned depends on the selection: a single label on both axes yields a single value; a single label on just one axis yields a Series; and multiple labels, slices, or boolean series on both axes yield a DataFrame. Note that a one-element list like ["Country"] still counts as multiple labels, so it produces a DataFrame.

Notice that label-based slicing includes the endpoint. This is different from position-based slicing, as used by Python lists. The developers of pandas decided to include the endpoint because it's hard to specify the label that comes after the last one you want to include!

In [38]:
emissions.loc[high_emissions, "Country":"Population"]
Out[38]:
Country Emissions Population
City
New York USA 200 1500
Beijing China 300 2000
In [39]:
emissions.loc[:, ["Country", "Emissions"]]
Out[39]:
Country Emissions
City
New York USA 200
Paris France 48
Beijing China 300
Nice France 40
Seattle USA 100
In [40]:
emissions.loc[:, ["Country"]]
Out[40]:
Country
City
New York USA
Paris France
Beijing China
Nice France
Seattle USA
In [41]:
emissions.loc[:, "Country"]
Out[41]:
City
New York       USA
Paris       France
Beijing      China
Nice        France
Seattle        USA
Name: Country, dtype: object
In [42]:
emissions.loc["Paris", "Country"]
Out[42]:
'France'

Returning to our prior staff hours example, we can get Iris's hours by using a single df.loc[index, columns] access rather than two separate accesses. This convenient syntax only works when we've specified a meaningful index.

In [43]:
staff.loc["Iris", "Hours"]
Out[43]:
15
In [45]:
staff.loc["Hours", "Iris"]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Iris'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[45], line 1
----> 1 staff.loc["Hours", "Iris"]

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexing.py:1183, in _LocationIndexer.__getitem__(self, key)
   1181     key = tuple(com.apply_if_callable(x, self.obj) for x in key)
   1182     if self._is_scalar_access(key):
-> 1183         return self.obj._get_value(*key, takeable=self._takeable)
   1184     return self._getitem_tuple(key)
   1185 else:
   1186     # we by definition only have the 0th axis

File /opt/conda/lib/python3.11/site-packages/pandas/core/frame.py:4214, in DataFrame._get_value(self, index, col, takeable)
   4211     series = self._ixs(col, axis=1)
   4212     return series._values[index]
-> 4214 series = self._get_item_cache(col)
   4215 engine = self.index._engine
   4217 if not isinstance(self.index, MultiIndex):
   4218     # CategoricalIndex: Trying to use the engine fastpath may give incorrect
   4219     #  results if our categories are integers that dont match our codes
   4220     # IntervalIndex: IntervalTree has no get_loc

File /opt/conda/lib/python3.11/site-packages/pandas/core/frame.py:4638, in DataFrame._get_item_cache(self, item)
   4633 res = cache.get(item)
   4634 if res is None:
   4635     # All places that call _get_item_cache have unique columns,
   4636     #  pending resolution of GH#33047
-> 4638     loc = self.columns.get_loc(item)
   4639     res = self._ixs(loc, axis=1)
   4641     cache[item] = res

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'Iris'
In [47]:
staff
Out[47]:
Hours
Name
Anna 20
Iris 15
Abiy 10
Gege 12
In [49]:
staff["Name"]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Name'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[49], line 1
----> 1 staff["Name"]

File /opt/conda/lib/python3.11/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'Name'
In [52]:
# Using a boolean mask to pick the row and then selecting the column
# ends up being a single-row Series rather than an individual value (15)
staff[staff.index == "Iris"]["Hours"]
Out[52]:
Name
Iris    15
Name: Hours, dtype: int64
In [53]:
staff.loc["Iris", "Hours"]
# Better than writing staff["Hours"]["Iris"]
Out[53]:
15

Practice: Largest earthquake place (Pandas)¶

Previously, we learned about two ways to write Python code to read earthquakes as a list of dictionaries and return the name of the place with the largest-magnitude earthquake.

In [54]:
def largest_earthquake_place(path):
    """
    Returns the name of the place with the largest-magnitude earthquake in the specified CSV file.

    >>> largest_earthquake_place("earthquakes.csv")
    'Northern Mariana Islands'
    """
    earthquakes = pd.read_csv(path).to_dict("records")

    max_name = None
    max_magn = 0.0
    for earthquake in earthquakes:
        if earthquake["magnitude"] > max_magn:
            max_name = earthquake["name"]
            max_magn = earthquake["magnitude"]
    return max_name


doctest.run_docstring_examples(largest_earthquake_place, globals())

How might we convert this program to solve the problem directly with a DataFrame instead?

In [66]:
def largest_earthquake_place(path):
    """
    Returns the name of the place with the largest-magnitude earthquake in the specified CSV file.

    >>> largest_earthquake_place("earthquakes.csv")
    'Northern Mariana Islands'
    """
    earthquakes = pd.read_csv(path, index_col="id")
    display(earthquakes) # Helpful for debugging: delete when done
    return earthquakes.loc[earthquakes["magnitude"].idxmax(), "name"]


doctest.run_docstring_examples(largest_earthquake_place, globals())
year month day latitude longitude name magnitude
id
nc72666881 2016 7 27 37.672333 -121.619000 California 1.43
us20006i0y 2016 7 27 21.514600 94.572100 Burma 4.90
nc72666891 2016 7 27 37.576500 -118.859167 California 0.06
nc72666896 2016 7 27 37.595833 -118.994833 California 0.40
nn00553447 2016 7 27 39.377500 -119.845000 Nevada 0.30
... ... ... ... ... ... ... ...
nc72685246 2016 8 25 36.515499 -121.099831 California 2.42
ak13879193 2016 8 25 61.498400 -149.862700 Alaska 1.40
nc72685251 2016 8 25 38.805000 -122.821503 California 1.06
ci37672328 2016 8 25 34.308000 -118.635333 California 1.55
ci37672360 2016 8 25 34.119167 -116.933667 California 0.89

8394 rows × 7 columns

In [62]:
earthquakes = pd.read_csv("earthquakes.csv", index_col="id")

earthquakes.idxmax()["magnitude"] # idxmax for every column, then select the entry for magnitude
Out[62]:
'us100068jg'
In [63]:
earthquakes["magnitude"].idxmax() # idxmax for just the magnitude column
Out[63]:
'us100068jg'
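
As a sketch of one more approach, using sort_values from earlier in this lesson: sorting by magnitude in descending order puts the largest-magnitude earthquake first, so its index label is the first entry of the sorted index.

# Equivalent to idxmax above (any ties are broken arbitrarily)
earthquakes.sort_values("magnitude", ascending=False).index[0]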

Optional: Selection by position¶

Everything we've learned so far is an example of label-based indexing. But it turns out there's another system of position-based indexing that is also available. Let's compare the 4 approaches.

  • df[colname] returns the corresponding Series from the df.
    • df[[col1, col2, ...]] returns a new DataFrame containing the corresponding columns from the df.
  • df[boolean_series] returns a new DataFrame containing just the rows specified True in the boolean_series.
  • df.loc[index, columns] returns a single value, a Series, or a DataFrame for the label-based selection from the df.
  • df.iloc[rows, columns] returns a single value, a Series, or a DataFrame for the position-based selection from the df.

Label-based indexing uses the bolded row and column labels, while position-based indexing uses only integer positions. Slicing by position excludes the endpoint, just like slicing a Python list. Position-based indexing is most useful when you have a position-based query that can't easily be expressed using labels alone. For example, we might want to select just the rightmost two columns from a DataFrame without knowing their names.

In [ ]:
emissions.iloc[:, -2:]
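
For comparison, here's a minimal sketch of a label-based and a position-based selection reaching the same value in our emissions data:

emissions.loc["New York", "Emissions"]  # label-based: 200
emissions.iloc[0, 1]                    # position-based: row 0, column 1, also 200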

We generally won't use position-based selections in this course, but you may run into code that uses them elsewhere.