Data Frames¶

Earlier, we learned how to process CSV files using the list of dictionaries representation. This week, we will introduce pandas, the most commonly used Python data programming tool and one that we'll be using for the remainder of the course. By the end of this lesson, students will be able to:

  • Import values and functions from another module using import and from statements.
  • Select individual columns from a pandas DataFrame and apply element-wise computations.
  • Filter a pandas DataFrame or Series with a boolean series.

The last two learning objectives are particularly ambitious: they will take much more deliberate practice before you feel comfortable.

In [1]:
import doctest
import io
import pandas as pd

Import statements¶

We've been writing some curious lines of code called import statements, which let us use code written by other people in separate modules.

The simplest syntax uses the import statement to import a module like doctest. We can then call the definitions within that module like doctest.testmod() to run all our doctests.

import doctest
doctest.testmod()

We can also import a module and rename it to a more convenient shorthand, like pd instead of pandas. We can then call the definitions within the module like pd.read_csv(path).to_dict("records") to read a CSV file and then convert it into our list of dictionaries ("records") representation.

import pandas as pd
earthquakes = pd.read_csv(path).to_dict("records")

Finally, Python can also import just a single definition from a module. Here, we ask Python to import only the Counter dictionary type from the collections module.

from collections import Counter

def count_words(path):
    # Count how many times each whitespace-separated word appears in the file
    with open(path) as f:
        return Counter(f.read().split())

A common practice in notebooks is to add your imports to the first code cell at the top of your notebook so that someone who's running your notebook will know what modules they will need to install to run the code.

In [2]:
from doctest import testmod
testmod()
Out[2]:
TestResults(failed=0, attempted=0)

Creating a Data Frame¶

To create a DataFrame, call pd.read_csv(path). In addition to reading CSV data from a file, pd.read_csv also accepts an io.StringIO object, which lets us read CSV data directly from a Python string: handy for specifying small datasets in a code cell.

In [4]:
csv = """
Name,Hours
Anna,20
Iris,15
Abiy,10
Gege,12
"""

staff = pd.read_csv(io.StringIO(csv))
staff
Out[4]:
   Name  Hours
0  Anna     20
1  Iris     15
2  Abiy     10
3  Gege     12
In [5]:
staff.to_dict("records")
Out[5]:
[{'Name': 'Anna', 'Hours': 20},
 {'Name': 'Iris', 'Hours': 15},
 {'Name': 'Abiy', 'Hours': 10},
 {'Name': 'Gege', 'Hours': 12}]

The index of a DataFrame appears in bold along the left side (one label per row) and defines the keys for accessing values in the data frame. Like keys in a dictionary, the keys in an index should be unique.
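pandas won't actually stop you from building an index with duplicate labels, though, so it can be worth checking. A minimal sketch using the staff DataFrame above:

staff.index.is_unique  # True: the default labels 0, 1, 2, 3 are all distinct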

By default, an integer index is provided, but you'll often want to set a more meaningful index. We can use df.set_index(colname) to return a new DataFrame whose index is the given column, which will be handy later. In the example below, we assume that each TA has a unique name, though this assumption has severe limits in practice: people can change their names, or we might eventually run a course where two people share the same name.

In [6]:
staff = staff.set_index("Name")
staff
Out[6]:
      Hours
Name
Anna     20
Iris     15
Abiy     10
Gege     12
In [10]:
type(staff)
Out[10]:
pandas.core.frame.DataFrame
In [7]:
staff.loc["Iris"]
Out[7]:
Hours    15
Name: Iris, dtype: int64

Column indexers¶

In pandas, tabular data is represented by a DataFrame as shown above. Unlike the list of dictionaries format that required us to write a loop to access the name of every TA, pandas provides special syntax to help us achieve this result.

In [8]:
staff.index
Out[8]:
Index(['Anna', 'Iris', 'Abiy', 'Gege'], dtype='object', name='Name')
In [11]:
type(staff.index)
Out[11]:
pandas.core.indexes.base.Index
In [9]:
staff["Hours"]
Out[9]:
Name
Anna    20
Iris    15
Abiy    10
Gege    12
Name: Hours, dtype: int64
In [12]:
type(staff["Hours"])
Out[12]:
pandas.core.series.Series

df["Hours"] returns a pandas object called a Series that represents a single column or row of a DataFrame. A Series is very similar to a list from Python, but has several convenient functions for data analysis.

  • s.mean() returns the average value in s.
  • s.min() returns the minimum value in s.
    • s.idxmin() returns the label of the minimum value in s.
  • s.max() returns the maximum value in s.
    • s.idxmax() returns the label of the maximum value in s.
  • s.unique() returns a NumPy array of all the unique values in s.
  • s.describe() returns a new Series containing descriptive statistics for the data in s.
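For instance, here's a quick sketch applying a few of these functions to the staff hours from above, with outputs shown as comments:

hours = staff["Hours"]
hours.mean()    # 14.25
hours.idxmax()  # 'Anna', the label for the maximum value 20
hours.unique()  # array([20, 15, 10, 12])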
In [13]:
staff["Hours"].describe()
Out[13]:
count     4.000000
mean     14.250000
std       4.349329
min      10.000000
25%      11.500000
50%      13.500000
75%      16.250000
max      20.000000
Name: Hours, dtype: float64
In [15]:
staff["Hours"].min()
Out[15]:
10

Defining a more meaningful index allows us to select specific values from a series just by referring to the desired key.

In [17]:
staff["Hours"]["Iris"]
Out[17]:
15
In [23]:
staff.loc["Iris"]
Out[23]:
Hours    15
Name: Iris, dtype: int64
In [19]:
staff.loc["Iris"]["Hours"]
Out[19]:
15

How can we compute the range of TA hours by calling the min() and max() functions? For this example dataset, the range should be 10: Anna has the most hours (20) and Abiy has the fewest (10), for a difference of 10.

In [26]:
staff["Hours"].max() - staff["Hours"].min()
Out[26]:
10
In [27]:
staff.max() - staff.min()
Out[27]:
Hours    10
dtype: int64
In [28]:
type(staff.max() - staff.min())
Out[28]:
pandas.core.series.Series
In [29]:
staff.max()
Out[29]:
Hours    20
dtype: int64

Element-wise operations¶

Let's consider a slightly more complex dataset that has more columns, like this made-up emissions dataset. The pd.read_csv function also includes an index_col parameter that you can use to set the index while reading the dataset.

In [30]:
csv = """
City,Country,Emissions,Population
New York,USA,200,1500
Paris,France,48,42
Beijing,China,300,2000
Nice,France,40,60
Seattle,USA,100,1000
"""

emissions = pd.read_csv(io.StringIO(csv), index_col="City")
# This is like writing set_index("City")
emissions
Out[30]:
          Country  Emissions  Population
City
New York      USA        200        1500
Paris      France         48          42
Beijing     China        300        2000
Nice       France         40          60
Seattle       USA        100        1000

pandas can help us answer questions like computing the emissions per capita: emissions divided by population for each city.

In [31]:
emissions["Emissions"] / emissions["Population"]
Out[31]:
City
New York    0.133333
Paris       1.142857
Beijing     0.150000
Nice        0.666667
Seattle     0.100000
dtype: float64

Element-wise operations also work if one of the operands is a single value rather than a Series. For example, the following cell adds 4 to each city population.

In [32]:
emissions["Population"] + 4
Out[32]:
City
New York    1504
Paris         46
Beijing     2004
Nice          64
Seattle     1004
Name: Population, dtype: int64
In [33]:
emissions
Out[33]:
          Country  Emissions  Population
City
New York      USA        200        1500
Paris      France         48          42
Beijing     China        300        2000
Nice       France         40          60
Seattle       USA        100        1000
In [34]:
emissions["Population"] = emissions["Population"] + 4
emissions
Out[34]:
          Country  Emissions  Population
City
New York      USA        200        1504
Paris      France         48          46
Beijing     China        300        2004
Nice       France         40          64
Seattle       USA        100        1004

Row indexers¶

All the above operations apply to every row in the original data frame. What if our questions involve returning just a few rows, like filtering the data to identify only the cities that have at least 200 emissions?

In [36]:
high_emissions = emissions["Emissions"] >= 200
emissions[high_emissions]
Out[36]:
          Country  Emissions  Population
City
New York      USA        200        1504
Beijing     China        300        2004

This new syntax shows how we can filter a dataframe by indexing it with a boolean series. PandasTutor shows you how the above output is determined by selecting only the rows that are True in the following boolean series.

In [37]:
high_emissions
Out[37]:
City
New York     True
Paris       False
Beijing      True
Nice        False
Seattle     False
Name: Emissions, dtype: bool

Multiple conditions can be combined using the following element-wise operators.

  • & performs an element-wise and operation.
  • | performs an element-wise or operation.
  • ~ performs an element-wise not operation.

Because of Python's operator precedence, the element-wise operators bind more tightly than comparisons like == and >=, so parentheses are required around each comparison when combining boolean series in a single expression.
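For example, here's a sketch that combines two conditions with the | operator, with parentheses around each comparison:

# Cities that are in the USA or in France
emissions[(emissions["Country"] == "USA") | (emissions["Country"] == "France")]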

In [40]:
emissions[~high_emissions & (emissions["Country"] == "USA")]
Out[40]:
         Country  Emissions  Population
City
Seattle      USA        100        1004

Write a one-line pandas expression that returns all the cities in France that have a population greater than 50 from the emissions dataset.

In [41]:
emissions[(emissions["Country"] == "France") & (emissions["Population"] > 50)]
Out[41]:
      Country  Emissions  Population
City
Nice   France         40          64
In [43]:
emissions[emissions["Country"] == "France"] and emissions[emissions["Population"] > 50]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_347/3398251716.py in ?()
----> 1 emissions[emissions["Country"] == "France"] and emissions[emissions["Population"] > 50]

/opt/conda/lib/python3.11/site-packages/pandas/core/generic.py in ?(self)
   1575     @final
   1576     def __nonzero__(self) -> NoReturn:
-> 1577         raise ValueError(
   1578             f"The truth value of a {type(self).__name__} is ambiguous. "
   1579             "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
   1580         )

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
In [46]:
emissions["Country"] == "France" & emissions["Population"] > 50
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:362, in na_logical_op(x, y, op)
    353 try:
    354     # For exposition, write:
    355     #  yarr = isinstance(y, np.ndarray)
   (...)
    360     # Then Cases where this goes through without raising include:
    361     #  (xint or xbool) and (yint or bool)
--> 362     result = op(x, y)
    363 except TypeError:

File /opt/conda/lib/python3.11/site-packages/pandas/core/roperator.py:54, in rand_(left, right)
     53 def rand_(left, right):
---> 54     return operator.and_(right, left)

TypeError: ufunc 'bitwise_and' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:376, in na_logical_op(x, y, op)
    375 try:
--> 376     result = libops.scalar_binop(x, y, op)
    377 except (
    378     TypeError,
    379     ValueError,
   (...)
    382     NotImplementedError,
    383 ) as err:

File ops.pyx:180, in pandas._libs.ops.scalar_binop()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[46], line 1
----> 1 emissions["Country"] == "France" & emissions["Population"] > 50

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/common.py:76, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
     72             return NotImplemented
     74 other = item_from_zerodim(other)
---> 76 return method(self, other)

File /opt/conda/lib/python3.11/site-packages/pandas/core/arraylike.py:74, in OpsMixin.__rand__(self, other)
     72 @unpack_zerodim_and_defer("__rand__")
     73 def __rand__(self, other):
---> 74     return self._logical_method(other, roperator.rand_)

File /opt/conda/lib/python3.11/site-packages/pandas/core/series.py:6130, in Series._logical_method(self, other, op)
   6127 lvalues = self._values
   6128 rvalues = extract_array(other, extract_numpy=True, extract_range=True)
-> 6130 res_values = ops.logical_op(lvalues, rvalues, op)
   6131 return self._construct_result(res_values, name=res_name)

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:454, in logical_op(left, right, op)
    450 else:
    451     # i.e. scalar
    452     is_other_int_dtype = lib.is_integer(rvalues)
--> 454 res_values = na_logical_op(lvalues, rvalues, op)
    456 # For int vs int `^`, `|`, `&` are bitwise operators and return
    457 #   integer dtypes.  Otherwise these are boolean ops
    458 if not (left.dtype.kind in "iu" and is_other_int_dtype):

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:385, in na_logical_op(x, y, op)
    377         except (
    378             TypeError,
    379             ValueError,
   (...)
    382             NotImplementedError,
    383         ) as err:
    384             typ = type(y).__name__
--> 385             raise TypeError(
    386                 f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array "
    387                 f"and scalar of type [{typ}]"
    388             ) from err
    390 return result.reshape(x.shape)

TypeError: Cannot perform 'rand_' with a dtyped [int64] array and scalar of type [bool]
In [47]:
(emissions["Country"] == "France") & (emissions["Population"] > 50)
Out[47]:
City
New York    False
Paris       False
Beijing     False
Nice         True
Seattle     False
dtype: bool
In [48]:
emissions["Country"] == ("France" & emissions["Population"]) > 50
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:362, in na_logical_op(x, y, op)
    353 try:
    354     # For exposition, write:
    355     #  yarr = isinstance(y, np.ndarray)
   (...)
    360     # Then Cases where this goes through without raising include:
    361     #  (xint or xbool) and (yint or bool)
--> 362     result = op(x, y)
    363 except TypeError:

File /opt/conda/lib/python3.11/site-packages/pandas/core/roperator.py:54, in rand_(left, right)
     53 def rand_(left, right):
---> 54     return operator.and_(right, left)

TypeError: ufunc 'bitwise_and' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:376, in na_logical_op(x, y, op)
    375 try:
--> 376     result = libops.scalar_binop(x, y, op)
    377 except (
    378     TypeError,
    379     ValueError,
   (...)
    382     NotImplementedError,
    383 ) as err:

File ops.pyx:180, in pandas._libs.ops.scalar_binop()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[48], line 1
----> 1 emissions["Country"] == ("France" & emissions["Population"]) > 50

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/common.py:76, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
     72             return NotImplemented
     74 other = item_from_zerodim(other)
---> 76 return method(self, other)

File /opt/conda/lib/python3.11/site-packages/pandas/core/arraylike.py:74, in OpsMixin.__rand__(self, other)
     72 @unpack_zerodim_and_defer("__rand__")
     73 def __rand__(self, other):
---> 74     return self._logical_method(other, roperator.rand_)

File /opt/conda/lib/python3.11/site-packages/pandas/core/series.py:6130, in Series._logical_method(self, other, op)
   6127 lvalues = self._values
   6128 rvalues = extract_array(other, extract_numpy=True, extract_range=True)
-> 6130 res_values = ops.logical_op(lvalues, rvalues, op)
   6131 return self._construct_result(res_values, name=res_name)

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:454, in logical_op(left, right, op)
    450 else:
    451     # i.e. scalar
    452     is_other_int_dtype = lib.is_integer(rvalues)
--> 454 res_values = na_logical_op(lvalues, rvalues, op)
    456 # For int vs int `^`, `|`, `&` are bitwise operators and return
    457 #   integer dtypes.  Otherwise these are boolean ops
    458 if not (left.dtype.kind in "iu" and is_other_int_dtype):

File /opt/conda/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:385, in na_logical_op(x, y, op)
    377         except (
    378             TypeError,
    379             ValueError,
   (...)
    382             NotImplementedError,
    383         ) as err:
    384             typ = type(y).__name__
--> 385             raise TypeError(
    386                 f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array "
    387                 f"and scalar of type [{typ}]"
    388             ) from err
    390 return result.reshape(x.shape)

TypeError: Cannot perform 'rand_' with a dtyped [int64] array and scalar of type [bool]

Selection by label¶

To summarize what we've learned so far, pandas provides both column indexers and row indexers accessible through the square brackets notation.

  • df[colname] returns the corresponding Series from the df.
  • df[boolean_series] returns a new DataFrame containing just the rows marked True in the boolean_series.

These two access methods are special cases of the more general df.loc[rows, columns] indexer, which provides more functionality. For example, we can select just the city populations for cities with at least 200 emissions and visualize the procedure in PandasTutor.

In [49]:
staff.loc["Iris"]["Hours"]
Out[49]:
15
In [50]:
staff.loc["Iris", "Hours"]
Out[50]:
15
In [51]:
emissions.loc[high_emissions, "Population"]
Out[51]:
City
New York    1504
Beijing     2004
Name: Population, dtype: int64
In [52]:
emissions.loc[~high_emissions & (emissions["Country"] == "USA"), "Population"]
Out[52]:
City
Seattle    1004
Name: Population, dtype: int64

Whether a single value, a 1-dimensional Series, or a 2-dimensional DataFrame is returned depends on the selection.

Notice that label-based slicing includes the endpoint, unlike slicing a Python list.

In [53]:
emissions.loc[high_emissions, "Country":"Population"]
Out[53]:
          Country  Emissions  Population
City
New York      USA        200        1504
Beijing     China        300        2004
In [54]:
emissions.loc[:, ["Country", "Emissions"]]
Out[54]:
          Country  Emissions
City
New York      USA        200
Paris      France         48
Beijing     China        300
Nice       France         40
Seattle       USA        100
In [56]:
emissions.loc["Paris", ["Country", "Population"]]
Out[56]:
Country       France
Population        46
Name: Paris, dtype: object

Returning to our prior staff hours example, we can get Iris's hours by using a single df.loc[index, columns] access rather than two separate accesses. This convenient syntax only works when we've specified a meaningful index.

In [57]:
staff.loc["Iris", "Hours"]
Out[57]:
15

Practice: Largest earthquake place (Pandas)¶

Previously, we learned about two ways to write Python code to read earthquakes as a list of dictionaries and return the name of the place with the largest-magnitude earthquake.

In [59]:
def largest_earthquake_place(path):
    """
    Returns the name of the place with the largest-magnitude earthquake in the specified CSV file.

    >>> largest_earthquake_place("earthquakes.csv")
    'Northern Mariana Islands'
    """
    earthquakes = pd.read_csv(path).to_dict("records")
    # List of dictionaries format

    max_name = None
    max_magn = None
    for earthquake in earthquakes:
        if max_magn is None or earthquake["magnitude"] > max_magn:
            max_name = earthquake["name"]
            max_magn = earthquake["magnitude"]
    return max_name


doctest.run_docstring_examples(largest_earthquake_place, globals())

How might we convert this program to solve the problem directly with a DataFrame instead?

In [68]:
def largest_earthquake_place(path):
    """
    Returns the name of the place with the largest-magnitude earthquake in the specified CSV file.

    >>> largest_earthquake_place("earthquakes.csv")
    'Northern Mariana Islands'
    """
    earthquakes = pd.read_csv(path, index_col="id")
    return earthquakes.loc[earthquakes["magnitude"].idxmax(), "name"]


doctest.run_docstring_examples(largest_earthquake_place, globals())
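The same question can also be answered with the boolean filtering we learned earlier. One subtle difference: idxmax returns only the first matching label when there is a tie, whereas filtering keeps every tied row. A sketch, reusing the earthquakes DataFrame from the function body:

largest = earthquakes["magnitude"].max()
earthquakes[earthquakes["magnitude"] == largest]["name"]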

Optional: Selection by position¶

Everything we've learned so far is an example of label-based indexing. But it turns out there's another system of position-based indexing that is also available. Let's compare the 4 approaches.

  • df[colname] returns the corresponding Series from the df.
    • df[[col1, col2, ...]] returns a new DataFrame containing the corresponding columns from the df.
  • df[boolean_series] returns a new DataFrame containing just the rows marked True in the boolean_series.
  • df.loc[index, columns] returns a single value, a Series, or a DataFrame for the label-based selection from the df.
  • df.iloc[rows, columns] returns a single value, a Series, or a DataFrame for the position-based selection from the df.

Label-based indexing uses the bolded row and column labels, while position-based indexing uses purely integer positions. Slicing by position excludes the endpoint, just like slicing a Python list. Position-based indexing is most useful when you have a position-based query that can't be easily specified using only label-based indexing. For example, we might know that we want to select just the rightmost two columns from a dataframe without knowing the column names.

In [ ]:
emissions.iloc[:, -2:]
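As a quick sketch of the endpoint difference, using the emissions data from above (both expressions select the same two rows):

emissions.iloc[0:2]                # position-based: rows 0 and 1, endpoint excluded
emissions.loc["New York":"Paris"]  # label-based: both endpoints included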

We generally won't use position-based selections in this course, but you may run into code that uses them elsewhere.