import doctest
import io
import pandas as pd

import doctest
doctest.testmod()

# Inline CSV text used in place of a file on disk. "Diana" appears twice, so
# the Name column does not contain unique values.
# NOTE: `csv` here is a plain string, not the standard-library csv module
# (that module is imported locally inside largest_earthquake_place).
csv = """
Name,Hours
Diana,10
Diana,11
Thrisha,15
Yuxiang,20
Sheamin,12
"""

# io.StringIO wraps the string in a file-like object so pd.read_csv can parse
# it the same way it would parse an open file.
staff = pd.read_csv(io.StringIO(csv))
staff

staff["Name"]

0      Diana
1      Diana
2    Thrisha
3    Yuxiang
4    Sheamin
Name: Name, dtype: object

staff = staff.set_index("Name")
staff

staff.index

Index(['Diana', 'Diana', 'Thrisha', 'Yuxiang', 'Sheamin'], dtype='object', name='Name')

staff.columns

Index(['Hours'], dtype='object')

staff["Hours"]

Name
Diana      10
Diana      11
Thrisha    15
Yuxiang    20
Sheamin    12
Name: Hours, dtype: int64

type(staff["Hours"])

pandas.core.series.Series

staff["Hours"].describe()

count     5.000000
mean     13.600000
std       4.037326
min      10.000000
25%      11.000000
50%      12.000000
75%      15.000000
max      20.000000
Name: Hours, dtype: float64

staff["Hours"]["Thrisha"]

15

staff["Hours"].max() - staff["Hours"].min()

10

# Inline CSV text for the emissions examples; this rebinds the `csv` string
# that previously held the staff data.
csv = """
City,Country,Emissions,Population
New York,USA,200,1500
Paris,France,48,42
Beijing,China,300,2000
Nice,France,40,60
Seattle,USA,100,1000
"""

# Option 1: name the index column while reading.
emissions = pd.read_csv(io.StringIO(csv), index_col="City")
emissions

# Option 2: read with the default integer index first, then make "City" the
# index. The result of set_index is assigned back to `emissions`.
emissions = pd.read_csv(io.StringIO(csv))
emissions = emissions.set_index("City")
emissions

emissions["Emissions"] / emissions["Population"]

City
New York    0.133333
Paris       1.142857
Beijing     0.150000
Nice        0.666667
Seattle     0.100000
dtype: float64

emissions["Emissions per Capita"] = emissions["Emissions"] / emissions["Population"]
emissions

emissions["Population"] + 4 # this is not going to change emissions["Population"]

City
New York    1504
Paris         46
Beijing     2004
Nice          64
Seattle     1004
Name: Population, dtype: int64

# Boolean Series indexed by City: True where Emissions is at least 200.
high_emissions = emissions["Emissions"] >= 200
# Indexing the frame with the boolean mask selects the rows where it is True.
emissions[high_emissions]

high_emissions

City
New York     True
Paris       False
Beijing      True
Nice        False
Seattle     False
Name: Emissions, dtype: bool

high_emissions

City
New York     True
Paris       False
Beijing      True
Nice        False
Seattle     False
Name: Emissions, dtype: bool

emissions[high_emissions | (emissions["Country"] == "USA")]

high_emissions | (emissions["Country"] == "USA")

City
New York     True
Paris       False
Beijing      True
Nice        False
Seattle      True
dtype: bool

# Poll Question time!
emissions[(emissions["Country"] == "France") & (emissions["Population"] > 50)]

emissions.loc[high_emissions, "Population"]

City
New York    1500
Beijing     2000
Name: Population, dtype: int64

emissions.loc[high_emissions, "Country":"Population"]

type(emissions)

pandas.core.frame.DataFrame

emissions.loc[:, ["Country", "Emissions"]]

emissions.loc["Paris", "Country"]

'France'

staff.loc["Thrisha", "Hours"]

15

def largest_earthquake_place(path):
    """
    Returns the name of the place with the largest-magnitude earthquake in the specified CSV file.

    The file must have a header row that includes "name" and "magnitude" columns. Magnitudes are
    compared as numbers, not as text. If several rows tie for the largest magnitude, the first
    such row in the file wins.

    Raises ValueError if the file has no data rows or a magnitude cannot be parsed as a number.

    >>> largest_earthquake_place("earthquakes.csv")
    'Northern Mariana Islands'
    """
    # Local import: at module level in this file the name `csv` is bound to a CSV string.
    import csv

    # newline="" lets the csv module handle line endings inside quoted fields itself.
    with open(path, newline="") as f:
        earthquakes = list(csv.DictReader(f))

    if not earthquakes:
        raise ValueError(f"no earthquake rows found in {path!r}")

    # DictReader yields every field as a string, so convert before comparing;
    # otherwise "9.5" would sort above "10.2". max() keeps the first of equal maxima.
    strongest = max(earthquakes, key=lambda quake: float(quake["magnitude"]))
    return strongest["name"]

doctest.run_docstring_examples(largest_earthquake_place, globals())

earthquakes = pd.read_csv("earthquakes.csv", index_col="id")
display(earthquakes) # Helpful for debugging

earthquakes["magnitude"].idxmax()

'us100068jg'

earthquakes.loc[earthquakes["magnitude"].idxmax(), "name"]

'Northern Mariana Islands'

def largest_earthquake_place(path):
    """
    Finds the place name of the strongest earthquake recorded in the CSV file at the given path.

    The file is read into a DataFrame indexed by its "id" column; the "name" value of the row
    whose "magnitude" is largest is returned.

    >>> largest_earthquake_place("earthquakes.csv")
    'Northern Mariana Islands'
    """
    quakes = pd.read_csv(path, index_col="id")
    magnitudes = quakes["magnitude"]
    # idxmax gives the index label (the earthquake id) of the largest magnitude.
    strongest_id = magnitudes.idxmax()
    return quakes.loc[strongest_id, "name"]

doctest.run_docstring_examples(largest_earthquake_place, globals())

# emissions.iloc[:, -2:]
# emissions.iloc[:, -2:-1]
emissions.iloc[:, -2]

City
New York    1500
Paris         42
Beijing     2000
Nice          60
Seattle     1000
Name: Population, dtype: int64

	year	month	day	latitude	longitude	name	magnitude
id
nc72666881	2016	7	27	37.672333	-121.619000	California	1.43
us20006i0y	2016	7	27	21.514600	94.572100	Burma	4.90
nc72666891	2016	7	27	37.576500	-118.859167	California	0.06
nc72666896	2016	7	27	37.595833	-118.994833	California	0.40
nn00553447	2016	7	27	39.377500	-119.845000	Nevada	0.30
...	...	...	...	...	...	...	...
nc72685246	2016	8	25	36.515499	-121.099831	California	2.42
ak13879193	2016	8	25	61.498400	-149.862700	Alaska	1.40
nc72685251	2016	8	25	38.805000	-122.821503	California	1.06
ci37672328	2016	8	25	34.308000	-118.635333	California	1.55
ci37672360	2016	8	25	34.119167	-116.933667	California	0.89

Data Frames

Import statements

Creating a Data Frame

Column indexers

Element-wise operations

Row indexers

Selection by label

Practice: Largest earthquake place (Pandas)

Optional: Selection by position

	Country	Emissions	Population
City
New York	USA	200	1500
Paris	France	48	42
Beijing	China	300	2000
Nice	France	40	60
Seattle	USA	100	1000