Groupby and Indexing¶

In this lesson, we'll learn about an important DataFrame operation called groupby. Along the way, we'll also discuss how this groupby operation introduces an extra level of complexity to indexing and slicing values. By the end of this lesson, students will be able to:

  • Apply the groupby operation to a list of dictionaries and to a pandas DataFrame.
  • Select values from a hierarchical index using tuples and the slice object as keys.
  • Apply the apply operation to a pandas Series or DataFrame.

Previously, we learned how to find the largest earthquake in a dataset using both a list of dictionaries and a pandas DataFrame. How about finding the largest earthquake for each place in the dataset?

In [1]:
import doctest
import io
import pandas as pd

To help visualize our work, the following dataset contains the first 12 rows from earthquakes.csv.

In [25]:
csv = """
id,year,month,day,latitude,longitude,name,magnitude
nc72666881,2016,7,27,37.6723333,-121.619,California,1.43
us20006i0y,2016,7,27,21.5146,94.5721,Burma,4.9
nc72666891,2016,7,27,37.5765,-118.85916670000002,California,0.06
nc72666896,2016,7,27,37.5958333,-118.99483329999998,California,0.4
nn00553447,2016,7,27,39.3775,-119.845,Nevada,0.3
ak13805337,2016,7,27,61.2963,-152.46,Alaska,1.8
hv61354276,2016,7,27,19.4235,-155.60983330000005,Hawaii,1.0
ak13805339,2016,7,27,61.3019,-152.4507,Alaska,2.0
ci37640584,2016,7,27,35.503,-118.40583329999998,California,1.2
nc72666901,2016,7,27,37.673,-121.6133333,California,1.67
ci37640592,2016,7,27,33.5888333,-116.8165,California,0.48
nn00553416,2016,7,27,38.2638,-118.7351,Nevada,0.9
"""

earthquakes = pd.read_csv(io.StringIO(csv), index_col="id")
earthquakes
Out[25]:
year month day latitude longitude name magnitude
id
nc72666881 2016 7 27 37.672333 -121.619000 California 1.43
us20006i0y 2016 7 27 21.514600 94.572100 Burma 4.90
nc72666891 2016 7 27 37.576500 -118.859167 California 0.06
nc72666896 2016 7 27 37.595833 -118.994833 California 0.40
nn00553447 2016 7 27 39.377500 -119.845000 Nevada 0.30
ak13805337 2016 7 27 61.296300 -152.460000 Alaska 1.80
hv61354276 2016 7 27 19.423500 -155.609833 Hawaii 1.00
ak13805339 2016 7 27 61.301900 -152.450700 Alaska 2.00
ci37640584 2016 7 27 35.503000 -118.405833 California 1.20
nc72666901 2016 7 27 37.673000 -121.613333 California 1.67
ci37640592 2016 7 27 33.588833 -116.816500 California 0.48
nn00553416 2016 7 27 38.263800 -118.735100 Nevada 0.90

Groupby in plain Python¶

Let's first see how we can solve this problem using the list of dictionaries approach.

In [3]:
# Create a dictionary mapping each place name to the maximum magnitude recorded there
max_mag_by_place = {}
for earthquake in earthquakes.to_dict("records"): # Convert to list of dictionaries
    if earthquake["name"] not in max_mag_by_place or earthquake["magnitude"] > max_mag_by_place[earthquake["name"]]:
        max_mag_by_place[earthquake["name"]] = earthquake["magnitude"]

max_mag_by_place
Out[3]:
{'California': 1.67, 'Burma': 4.9, 'Nevada': 0.9, 'Alaska': 2.0, 'Hawaii': 1.0}

Groupby in Pandas¶

The inventors of pandas defined a DataFrame function called groupby to streamline this operation into a single expression.

In [5]:
# What's the maximum magnitude earthquake across the entire dataset?
earthquakes["magnitude"].max()
Out[5]:
4.9
In [4]:
# When you're writing Pandas code, probably don't write a for loop.
earthquakes.groupby("name")["magnitude"].max()
Out[4]:
name
Alaska        2.00
Burma         4.90
California    1.67
Hawaii        1.00
Nevada        0.90
Name: magnitude, dtype: float64

What's going on here? We can take a closer look at each step of the process in PandasTutor. In summary, this expression:

  1. Calls earthquakes.groupby("name") to split the earthquakes into groups by "name".
  2. For each group, selects the column "magnitude" indicated in square brackets.
  3. Combines (summarizes) each group on the selected column using the max() function.

groupby helps us quickly answer questions involving "grouping by" one or more columns and then summarizing the data in another column.
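
To make these three steps concrete, here's the same expression unpacked into intermediate variables (a sketch reusing the earthquakes DataFrame from above; the variable names are our own):

groups = earthquakes.groupby("name")   # 1. split the rows into groups by "name"
magnitudes = groups["magnitude"]       # 2. select the "magnitude" column
magnitudes.max()                       # 3. combine (summarize) each group using max()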

The best part about pandas groupby is that it allows us to quickly answer many different kinds of questions following the same format. For example, suppose we want to compute descriptive statistics for all the earthquake magnitudes that occurred on each day. Let's read the full dataset and try it out.

In [6]:
earthquakes = pd.read_csv("earthquakes.csv", index_col="id")
earthquakes
Out[6]:
year month day latitude longitude name magnitude
id
nc72666881 2016 7 27 37.672333 -121.619000 California 1.43
us20006i0y 2016 7 27 21.514600 94.572100 Burma 4.90
nc72666891 2016 7 27 37.576500 -118.859167 California 0.06
nc72666896 2016 7 27 37.595833 -118.994833 California 0.40
nn00553447 2016 7 27 39.377500 -119.845000 Nevada 0.30
... ... ... ... ... ... ... ...
nc72685246 2016 8 25 36.515499 -121.099831 California 2.42
ak13879193 2016 8 25 61.498400 -149.862700 Alaska 1.40
nc72685251 2016 8 25 38.805000 -122.821503 California 1.06
ci37672328 2016 8 25 34.308000 -118.635333 California 1.55
ci37672360 2016 8 25 34.119167 -116.933667 California 0.89

8394 rows × 7 columns

In [7]:
magnitudes_per_day = earthquakes.groupby(["year", "month", "day"])["magnitude"].describe()
magnitudes_per_day
Out[7]:
count mean std min 25% 50% 75% max
year month day
2016 7 27 272.0 1.617574 1.095349 0.06 0.9000 1.400 2.0000 5.60
28 308.0 1.448149 0.896851 0.10 0.8775 1.240 1.8000 5.10
29 309.0 1.640129 1.165952 0.01 0.8800 1.450 1.9000 7.70
30 329.0 1.615076 1.262618 0.03 0.7000 1.240 2.0000 5.70
31 278.0 1.750827 1.261577 0.10 0.9000 1.500 2.1475 5.90
8 1 356.0 1.520056 1.157326 0.04 0.8000 1.245 1.8025 6.10
2 292.0 1.539418 1.089946 0.05 0.8000 1.300 1.9000 5.50
3 294.0 1.556327 1.147365 0.01 0.8300 1.200 1.8150 5.10
4 420.0 1.249190 1.034738 0.05 0.6000 1.000 1.5825 6.30
5 256.0 1.428789 1.144244 0.10 0.6200 1.185 1.7150 5.70
6 316.0 1.313228 1.065587 0.09 0.5600 1.100 1.6200 5.40
7 316.0 1.356994 1.078556 0.10 0.6000 1.120 1.7425 5.10
8 335.0 1.484925 1.131495 0.02 0.6300 1.200 1.9900 5.10
9 272.0 1.614779 1.164186 0.10 0.8075 1.300 1.9000 5.09
10 329.0 1.404742 1.038701 0.02 0.7700 1.170 1.7400 5.40
11 356.0 1.390534 1.159147 0.04 0.6775 1.100 1.7000 7.20
12 326.0 1.533282 1.158696 0.04 0.7400 1.200 1.9000 5.40
13 284.0 1.421901 1.080338 0.10 0.7000 1.105 1.7775 5.20
14 231.0 1.692684 1.372191 0.05 0.8200 1.200 1.9000 5.90
15 222.0 1.583964 1.157553 0.07 0.8550 1.300 1.8400 5.40
16 223.0 1.629910 1.223131 0.10 0.7300 1.300 2.0000 5.10
17 220.0 1.583682 1.203617 0.10 0.8000 1.210 1.9675 5.20
18 219.0 1.499772 1.159497 0.04 0.7600 1.200 1.8050 5.90
19 226.0 1.819469 1.416291 0.11 0.8925 1.340 2.1900 7.40
20 237.0 1.553207 1.296262 0.02 0.7000 1.200 2.0000 6.40
21 266.0 1.332368 1.032210 0.03 0.6050 1.140 1.6925 5.10
22 215.0 1.451488 1.185657 0.01 0.7000 1.180 1.8000 5.60
23 233.0 1.643391 1.245661 0.04 0.8200 1.300 2.1000 6.20
24 216.0 1.553194 1.144054 0.03 0.8375 1.290 1.9000 6.80
25 238.0 1.519328 0.926028 0.10 0.8900 1.400 1.9950 5.90
In [12]:
earthquakes.groupby(["day", "month", "year"])["magnitude"].describe()
Out[12]:
count mean std min 25% 50% 75% max
day month year
1 8 2016 356.0 1.520056 1.157326 0.04 0.8000 1.245 1.8025 6.10
2 8 2016 292.0 1.539418 1.089946 0.05 0.8000 1.300 1.9000 5.50
3 8 2016 294.0 1.556327 1.147365 0.01 0.8300 1.200 1.8150 5.10
4 8 2016 420.0 1.249190 1.034738 0.05 0.6000 1.000 1.5825 6.30
5 8 2016 256.0 1.428789 1.144244 0.10 0.6200 1.185 1.7150 5.70
6 8 2016 316.0 1.313228 1.065587 0.09 0.5600 1.100 1.6200 5.40
7 8 2016 316.0 1.356994 1.078556 0.10 0.6000 1.120 1.7425 5.10
8 8 2016 335.0 1.484925 1.131495 0.02 0.6300 1.200 1.9900 5.10
9 8 2016 272.0 1.614779 1.164186 0.10 0.8075 1.300 1.9000 5.09
10 8 2016 329.0 1.404742 1.038701 0.02 0.7700 1.170 1.7400 5.40
11 8 2016 356.0 1.390534 1.159147 0.04 0.6775 1.100 1.7000 7.20
12 8 2016 326.0 1.533282 1.158696 0.04 0.7400 1.200 1.9000 5.40
13 8 2016 284.0 1.421901 1.080338 0.10 0.7000 1.105 1.7775 5.20
14 8 2016 231.0 1.692684 1.372191 0.05 0.8200 1.200 1.9000 5.90
15 8 2016 222.0 1.583964 1.157553 0.07 0.8550 1.300 1.8400 5.40
16 8 2016 223.0 1.629910 1.223131 0.10 0.7300 1.300 2.0000 5.10
17 8 2016 220.0 1.583682 1.203617 0.10 0.8000 1.210 1.9675 5.20
18 8 2016 219.0 1.499772 1.159497 0.04 0.7600 1.200 1.8050 5.90
19 8 2016 226.0 1.819469 1.416291 0.11 0.8925 1.340 2.1900 7.40
20 8 2016 237.0 1.553207 1.296262 0.02 0.7000 1.200 2.0000 6.40
21 8 2016 266.0 1.332368 1.032210 0.03 0.6050 1.140 1.6925 5.10
22 8 2016 215.0 1.451488 1.185657 0.01 0.7000 1.180 1.8000 5.60
23 8 2016 233.0 1.643391 1.245661 0.04 0.8200 1.300 2.1000 6.20
24 8 2016 216.0 1.553194 1.144054 0.03 0.8375 1.290 1.9000 6.80
25 8 2016 238.0 1.519328 0.926028 0.10 0.8900 1.400 1.9950 5.90
27 7 2016 272.0 1.617574 1.095349 0.06 0.9000 1.400 2.0000 5.60
28 7 2016 308.0 1.448149 0.896851 0.10 0.8775 1.240 1.8000 5.10
29 7 2016 309.0 1.640129 1.165952 0.01 0.8800 1.450 1.9000 7.70
30 7 2016 329.0 1.615076 1.262618 0.03 0.7000 1.240 2.0000 5.70
31 7 2016 278.0 1.750827 1.261577 0.10 0.9000 1.500 2.1475 5.90

Explain in your own words the result of the following code snippet.

In [10]:
# For each unique place name, what is the maximum latitude of any earthquake that occurred there?
earthquakes.groupby("name")["latitude"].max()
Out[10]:
name
Afghanistan                       36.634500
Alaska                            70.778700
Anguilla                          18.435800
Argentina                        -22.394200
Arizona                           36.811667
                                    ...    
Washington                        48.965667
West Virginia                     37.863000
Western Indian-Antarctic Ridge   -49.281000
Western Xizang                    34.444600
Wyoming                           44.749000
Name: latitude, Length: 118, dtype: float64
In [11]:
# For each unique place name, what is the label for the earthquake with the max latitude that occurred there?
earthquakes.groupby("name")["latitude"].idxmax()
Out[11]:
name
Afghanistan                       us10006djl
Alaska                            ak13841660
Anguilla                          pr16211003
Argentina                         us10006a1d
Arizona                           uu60159042
                                     ...    
Washington                        uw61189756
West Virginia                     se60029113
Western Indian-Antarctic Ridge    us10006cyi
Western Xizang                    us10006d57
Wyoming                           uu60157532
Name: latitude, Length: 118, dtype: object
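
Since idxmax returns row labels from the original DataFrame, we can pass them back into .loc to retrieve the full row for each group's highest-latitude earthquake. A sketch (reusing the earthquakes DataFrame from above):

# Look up the complete rows for each place's max-latitude earthquake
earthquakes.loc[earthquakes.groupby("name")["latitude"].idxmax()]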

Hierarchical indexing¶

If you look closely at the magnitudes_per_day DataFrame, you'll notice something interesting: there are three index columns in bold on the left to denote each year, month, and day group. In pandas, a DataFrame can have a hierarchical (aka multi-level) index called a MultiIndex.

In [13]:
magnitudes_per_day.index
Out[13]:
MultiIndex([(2016, 7, 27),
            (2016, 7, 28),
            (2016, 7, 29),
            (2016, 7, 30),
            (2016, 7, 31),
            (2016, 8,  1),
            (2016, 8,  2),
            (2016, 8,  3),
            (2016, 8,  4),
            (2016, 8,  5),
            (2016, 8,  6),
            (2016, 8,  7),
            (2016, 8,  8),
            (2016, 8,  9),
            (2016, 8, 10),
            (2016, 8, 11),
            (2016, 8, 12),
            (2016, 8, 13),
            (2016, 8, 14),
            (2016, 8, 15),
            (2016, 8, 16),
            (2016, 8, 17),
            (2016, 8, 18),
            (2016, 8, 19),
            (2016, 8, 20),
            (2016, 8, 21),
            (2016, 8, 22),
            (2016, 8, 23),
            (2016, 8, 24),
            (2016, 8, 25)],
           names=['year', 'month', 'day'])

We can select values from a MultiIndex by passing .loc a Python tuple as the key. However, the syntax is somewhat unusual, particularly when combined with slicing, due to limitations in the Python programming language. For each example below, predict the output type (single value, 1-d Series, or 2-d DataFrame) as well as the contents of the output before running it.

In [14]:
# How many earthquakes occurred on 2016-07-27?
# Is this a single value, a series, or a dataframe?
magnitudes_per_day.loc[(2016, 7, 27), "count"]
Out[14]:
272.0
In [18]:
# Get only the count column (for all the rows) from magnitudes_per_day.
magnitudes_per_day.loc[:, "count"]
Out[18]:
year  month  day
2016  7      27     272.0
             28     308.0
             29     309.0
             30     329.0
             31     278.0
      8      1      356.0
             2      292.0
             3      294.0
             4      420.0
             5      256.0
             6      316.0
             7      316.0
             8      335.0
             9      272.0
             10     329.0
             11     356.0
             12     326.0
             13     284.0
             14     231.0
             15     222.0
             16     223.0
             17     220.0
             18     219.0
             19     226.0
             20     237.0
             21     266.0
             22     215.0
             23     233.0
             24     216.0
             25     238.0
Name: count, dtype: float64
In [19]:
# How many earthquakes occurred between August 10-15, 2016?
# Will this include or exclude the endpoint?
# In pandas, loc is label-based! With label-based operations, it's not always possible
#   to know the next label after your endpoint. Therefore, pandas includes the endpoint
#   even when slicing.
# Will this be a single value, a series, or a dataframe?
magnitudes_per_day.loc[(2016, 8, 10:15), "count"]

# Python does not allow the colon for slicing inside the context of a tuple!
  Cell In[19], line 6
    magnitudes_per_day.loc[(2016, 8, 10:15), "count"]
                                       ^
SyntaxError: invalid syntax
In [20]:
(2016, 8, 10:15)
  Cell In[20], line 1
    (2016, 8, 10:15)
                ^
SyntaxError: invalid syntax
In [21]:
(2016, 8, slice(10, 15))
Out[21]:
(2016, 8, slice(10, 15, None))
In [22]:
# How many earthquakes occurred between August 10-15, 2016?
# Here is the corrected version using the built-in slice function! Python should be happy!
magnitudes_per_day.loc[(2016, 8, slice(10, 15)), "count"]
Out[22]:
year  month  day
2016  8      10     329.0
             11     356.0
             12     326.0
             13     284.0
             14     231.0
             15     222.0
Name: count, dtype: float64
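
As an aside, pandas also provides a helper object called pd.IndexSlice that lets us write the familiar colon syntax when slicing a MultiIndex. A sketch equivalent to the slice(10, 15) version above:

# pd.IndexSlice[2016, 8, 10:15] builds the tuple (2016, 8, slice(10, 15, None)) for us
magnitudes_per_day.loc[pd.IndexSlice[2016, 8, 10:15], "count"]
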
In [23]:
# How many earthquakes occurred in August 2016?
magnitudes_per_day.loc[(2016, 8), "count"]
Out[23]:
day
1     356.0
2     292.0
3     294.0
4     420.0
5     256.0
6     316.0
7     316.0
8     335.0
9     272.0
10    329.0
11    356.0
12    326.0
13    284.0
14    231.0
15    222.0
16    223.0
17    220.0
18    219.0
19    226.0
20    237.0
21    266.0
22    215.0
23    233.0
24    216.0
25    238.0
Name: count, dtype: float64
In [24]:
# How many earthquakes occurred in August 2016? (But keeping the year and month levels!)
magnitudes_per_day.loc[(2016, 8, slice()), "count"]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[24], line 2
      1 # How many earthquakes occurred in August 2016?
----> 2 magnitudes_per_day.loc[(2016, 8, slice()), "count"]

TypeError: slice expected at least 1 argument, got 0
In [25]:
# How many earthquakes occurred in August 2016? (But keeping the year and month levels!)
magnitudes_per_day.loc[(2016, 8, slice(None)), "count"]
Out[25]:
year  month  day
2016  8      1      356.0
             2      292.0
             3      294.0
             4      420.0
             5      256.0
             6      316.0
             7      316.0
             8      335.0
             9      272.0
             10     329.0
             11     356.0
             12     326.0
             13     284.0
             14     231.0
             15     222.0
             16     223.0
             17     220.0
             18     219.0
             19     226.0
             20     237.0
             21     266.0
             22     215.0
             23     233.0
             24     216.0
             25     238.0
Name: count, dtype: float64
In [26]:
# How many earthquakes occurred on these two dates?
magnitudes_per_day.loc[[(2016, 8, 1), (2016, 8, 15)], "count"]
Out[26]:
year  month  day
2016  8      1      356.0
             15     222.0
Name: count, dtype: float64
In [27]:
# On the days where there were fewer than 220 earthquakes, how many earthquakes were there?
magnitudes_per_day.loc[magnitudes_per_day["count"] < 220, "count"]
Out[27]:
year  month  day
2016  8      18     219.0
             22     215.0
             24     216.0
Name: count, dtype: float64
In [28]:
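# slice(10) means slice(None, 10): every day up to and including day 10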
magnitudes_per_day.loc[(2016, 8, slice(10)), "count"]
Out[28]:
year  month  day
2016  8      1      356.0
             2      292.0
             3      294.0
             4      420.0
             5      256.0
             6      316.0
             7      316.0
             8      335.0
             9      272.0
             10     329.0
Name: count, dtype: float64
In [29]:
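# slice(10, None): every day from day 10 onward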
magnitudes_per_day.loc[(2016, 8, slice(10, None)), "count"]
Out[29]:
year  month  day
2016  8      10     329.0
             11     356.0
             12     326.0
             13     284.0
             14     231.0
             15     222.0
             16     223.0
             17     220.0
             18     219.0
             19     226.0
             20     237.0
             21     266.0
             22     215.0
             23     233.0
             24     216.0
             25     238.0
Name: count, dtype: float64
In [32]:
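# Slices can appear at any level: months 7 through 8, days 10 onward (endpoints included)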
magnitudes_per_day.loc[(2016, slice(7, 8), slice(10, None)), "count"]
Out[32]:
year  month  day
2016  7      27     272.0
             28     308.0
             29     309.0
             30     329.0
             31     278.0
      8      10     329.0
             11     356.0
             12     326.0
             13     284.0
             14     231.0
             15     222.0
             16     223.0
             17     220.0
             18     219.0
             19     226.0
             20     237.0
             21     266.0
             22     215.0
             23     233.0
             24     216.0
             25     238.0
Name: count, dtype: float64
In [30]:
magnitudes_per_day.loc[(2016, 8, 10:), "count"]
  Cell In[30], line 1
    magnitudes_per_day.loc[(2016, 8, 10:), "count"]
                                       ^
SyntaxError: invalid syntax
In [31]:
# Main problem is that we're passing too many arguments to loc[rows, columns]
magnitudes_per_day.loc[2016, 8, 10:, "count"]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[31], line 1
----> 1 magnitudes_per_day.loc[2016, 8, 10:, "count"]

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexing.py:1184, in _LocationIndexer.__getitem__(self, key)
   1182     if self._is_scalar_access(key):
   1183         return self.obj._get_value(*key, takeable=self._takeable)
-> 1184     return self._getitem_tuple(key)
   1185 else:
   1186     # we by definition only have the 0th axis
   1187     axis = self.axis or 0

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexing.py:1368, in _LocIndexer._getitem_tuple(self, tup)
   1366 with suppress(IndexingError):
   1367     tup = self._expand_ellipsis(tup)
-> 1368     return self._getitem_lowerdim(tup)
   1370 # no multi-index, so validate all of the indexers
   1371 tup = self._validate_tuple_indexer(tup)

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexing.py:1041, in _LocationIndexer._getitem_lowerdim(self, tup)
   1039 # we may have a nested tuples indexer here
   1040 if self._is_nested_tuple_indexer(tup):
-> 1041     return self._getitem_nested_tuple(tup)
   1043 # we maybe be using a tuple to represent multiple dimensions here
   1044 ax0 = self.obj._get_axis(0)

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexing.py:1140, in _LocationIndexer._getitem_nested_tuple(self, tup)
   1137     # this is a series with a multi-index specified a tuple of
   1138     # selectors
   1139     axis = self.axis or 0
-> 1140     return self._getitem_axis(tup, axis=axis)
   1142 # handle the multi-axis by taking sections and reducing
   1143 # this is iterative
   1144 obj = self.obj

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexing.py:1424, in _LocIndexer._getitem_axis(self, key, axis)
   1422 # nested tuple slicing
   1423 if is_nested_tuple(key, labels):
-> 1424     locs = labels.get_locs(key)
   1425     indexer: list[slice | npt.NDArray[np.intp]] = [slice(None)] * self.ndim
   1426     indexer[axis] = locs

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/multi.py:3536, in MultiIndex.get_locs(self, seq)
   3532     continue
   3534 else:
   3535     # a slice or a single label
-> 3536     lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer)
   3538 # update indexer
   3539 lvl_indexer = _to_bool_indexer(lvl_indexer)

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/multi.py:3316, in MultiIndex._get_level_indexer(self, key, level, indexer)
   3308 def _get_level_indexer(
   3309     self, key, level: int = 0, indexer: npt.NDArray[np.bool_] | None = None
   3310 ):
   (...)
   3313     # in the totality of values
   3314     # if the indexer is provided, then use this
-> 3316     level_index = self.levels[level]
   3317     level_codes = self.codes[level]
   3319     def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
   3320         # Compute a bool indexer to identify the positions to take.
   3321         # If we have an existing indexer, we only need to examine the
   3322         # subset of positions where the existing indexer is True.

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/frozen.py:79, in FrozenList.__getitem__(self, n)
     77 if isinstance(n, slice):
     78     return type(self)(super().__getitem__(n))
---> 79 return super().__getitem__(n)

IndexError: list index out of range

Practice: UFO sightings¶

UFO (unidentified flying object) sightings have received attention from the US Congress in the past couple of years. We've collected a public dataset consisting of 1001 reported UFO sightings around the world to help us practice groupby operations.

In [33]:
ufos = pd.read_csv("ufos.csv", index_col="datetime")
ufos
Out[33]:
city state country shape duration (seconds) duration (hours/min) comments date posted latitude longitude
datetime
10/10/1949 20:30 san marcos tx us cylinder 2700.0 45 minutes This event took place in early fall around 194... 4/27/2004 29.883056 -97.941111
10/10/1949 21:00 lackland afb tx NaN light 7200.0 1-2 hrs 1949 Lackland AFB&#44 TX. Lights racing acros... 12/16/2005 29.384210 -98.581082
10/10/1955 17:00 chester (uk/england) NaN gb circle 20.0 20 seconds Green/Orange circular disc over Chester&#44 En... 1/21/2008 53.200000 -2.916667
10/10/1956 21:00 edna tx us circle 20.0 1/2 hour My older brother and twin sister were leaving ... 1/17/2004 28.978333 -96.645833
10/10/1960 20:00 kaneohe hi us light 900.0 15 minutes AS a Marine 1st Lt. flying an FJ4B fighter/att... 1/22/2004 21.418056 -157.803611
... ... ... ... ... ... ... ... ... ... ...
10/12/1982 05:30 pearlington ms us light 29.0 29 seconds Round light which was observed going into spac... 12/12/2013 30.246389 -89.611111
10/12/1985 23:45 swansboro nc us disk 300.0 23:45 - 23:50 My sister and I observed a disk for about 5 mi... 12/2/2000 34.687500 -77.119444
10/12/1988 16:30 melbourne (vic&#44 australia) NaN au cigar 900.0 15 min&#39s Large cigar shaped craft&#44flying sideways&#4... 9/29/2004 -37.813938 144.963425
10/12/1994 11:55 schenectady ny us other 120.0 a few minutes I had just picked up my Daugther (2yrs old) fr... 4/1/2001 42.814167 -73.940000
10/12/1994 15:00 monticello ky us chevron 120.0 1-2 minutes Triangular/chevron small object with fixed lig... 10/30/2006 36.829722 -84.849167

1001 rows × 10 columns

Compute the average (mean) "duration (seconds)" for each UFO "shape".

In [35]:
ufos.groupby("shape")["duration (seconds)"].mean()
Out[35]:
shape
changing     9.265600e+02
chevron      3.111250e+02
cigar        8.217407e+02
circle       6.804353e+02
cone         3.000000e+02
cross        6.000000e+01
cylinder     1.499556e+03
delta        1.440000e+04
diamond      1.140300e+03
disk         1.143329e+03
egg          3.088000e+03
fireball     3.467656e+02
flash        4.639091e+02
formation    1.913088e+03
light        1.122005e+03
other        9.259301e+04
oval         1.425136e+03
rectangle    2.285882e+02
sphere       1.506268e+06
teardrop     1.397143e+02
triangle     7.352900e+02
unknown      1.207963e+03
Name: duration (seconds), dtype: float64

Since we're focusing on the US Congress, identify the UFO sighting with the longest "duration (seconds)" for each "city" in the US ("us"). Do not include any cities outside the US.

In [37]:
"country" == "us"
Out[37]:
False
In [36]:
ufos["country" == "us"]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: False

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[36], line 1
----> 1 ufos["country" == "us"]

File /opt/conda/lib/python3.11/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File /opt/conda/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: False
In [38]:
ufos[ufos["country"] == "us"].groupby("city")["duration (seconds)"].max()
Out[38]:
city
acton                                   180.0
addison (i-355 and us 20 (lake st.)     600.0
albany                                  120.0
albuquerque                            3600.0
algona                                 3600.0
                                        ...  
wolfforth                               300.0
worcester                                 4.0
yakima                                  240.0
york                                     15.0
yuma                                    900.0
Name: duration (seconds), Length: 627, dtype: float64

What is the name of the "city" that has the largest count of UFO sightings?

In [39]:
ufos.groupby("city")["duration (seconds)"].count()
Out[39]:
city
acton                                         1
addison (i-355 and us 20 (lake st.)           1
adelaide  (south australia)                   1
adelaide (pt. wakefield) (south australia)    1
albany                                        4
                                             ..
yangtzee river (china)                        1
yerevan (armenia)                             1
york                                          2
yuma                                          1
zlatoust (russia)                             1
Name: duration (seconds), Length: 794, dtype: int64
In [24]:
ufos.groupby("city")["state"].count().idxmax()
Out[24]:
'seattle'
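
Alternatively (a sketch), value_counts tallies the rows for each city directly, so calling idxmax on its result answers the same question; note that it counts every row rather than only non-null "state" values, so the winner could differ if some rows are missing a state:

ufos["city"].value_counts().idxmax()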

String accessor functions¶

In data science, many tasks involve string data. In plain Python, we know that we can call string functions like split() to split a string on whitespace or find(target) to find the index at which a target first appears in a string.

To improve the readability of code, the inventors of pandas provide these functions as element-wise operations, but hide them behind a special .str string accessor, as in s.str.split().

In [3]:
ufos
Out[3]:
city state country shape duration (seconds) duration (hours/min) comments date posted latitude longitude
datetime
10/10/1949 20:30 san marcos tx us cylinder 2700.0 45 minutes This event took place in early fall around 194... 4/27/2004 29.883056 -97.941111
10/10/1949 21:00 lackland afb tx NaN light 7200.0 1-2 hrs 1949 Lackland AFB&#44 TX. Lights racing acros... 12/16/2005 29.384210 -98.581082
10/10/1955 17:00 chester (uk/england) NaN gb circle 20.0 20 seconds Green/Orange circular disc over Chester&#44 En... 1/21/2008 53.200000 -2.916667
10/10/1956 21:00 edna tx us circle 20.0 1/2 hour My older brother and twin sister were leaving ... 1/17/2004 28.978333 -96.645833
10/10/1960 20:00 kaneohe hi us light 900.0 15 minutes AS a Marine 1st Lt. flying an FJ4B fighter/att... 1/22/2004 21.418056 -157.803611
... ... ... ... ... ... ... ... ... ... ...
10/12/1982 05:30 pearlington ms us light 29.0 29 seconds Round light which was observed going into spac... 12/12/2013 30.246389 -89.611111
10/12/1985 23:45 swansboro nc us disk 300.0 23:45 - 23:50 My sister and I observed a disk for about 5 mi... 12/2/2000 34.687500 -77.119444
10/12/1988 16:30 melbourne (vic&#44 australia) NaN au cigar 900.0 15 min&#39s Large cigar shaped craft&#44flying sideways&#4... 9/29/2004 -37.813938 144.963425
10/12/1994 11:55 schenectady ny us other 120.0 a few minutes I had just picked up my Daugther (2yrs old) fr... 4/1/2001 42.814167 -73.940000
10/12/1994 15:00 monticello ky us chevron 120.0 1-2 minutes Triangular/chevron small object with fixed lig... 10/30/2006 36.829722 -84.849167

1001 rows × 10 columns

In [5]:
ufos["comments"].str.split()
Out[5]:
datetime
10/10/1949 20:30    [This, event, took, place, in, early, fall, ar...
10/10/1949 21:00    [1949, Lackland, AFB&#44, TX., Lights, racing,...
10/10/1955 17:00    [Green/Orange, circular, disc, over, Chester&#...
10/10/1956 21:00    [My, older, brother, and, twin, sister, were, ...
10/10/1960 20:00    [AS, a, Marine, 1st, Lt., flying, an, FJ4B, fi...
                                          ...                        
10/12/1982 05:30    [Round, light, which, was, observed, going, in...
10/12/1985 23:45    [My, sister, and, I, observed, a, disk, for, a...
10/12/1988 16:30    [Large, cigar, shaped, craft&#44flying, sidewa...
10/12/1994 11:55    [I, had, just, picked, up, my, Daugther, (2yrs...
10/12/1994 15:00    [Triangular/chevron, small, object, with, fixe...
Name: comments, Length: 1001, dtype: object

The above expression splits each comment by whitespace. This isn't too useful on its own, but we can then compute the length of each list to find the number of words in each comment.

In [7]:
ufos["comments"].str.split().str.len()
Out[7]:
datetime
10/10/1949 20:30    24
10/10/1949 21:00    17
10/10/1955 17:00     6
10/10/1956 21:00    26
10/10/1960 20:00    25
                    ..
10/12/1982 05:30    18
10/12/1985 23:45    16
10/12/1988 16:30     8
10/12/1994 11:55    28
10/12/1994 15:00    10
Name: comments, Length: 1001, dtype: int64

These functions don't modify the original DataFrame. To add the result as a new column in the original DataFrame, use an assignment statement.

In [9]:
ufos["word count"] = ufos["comments"].str.split().str.len()
ufos
Out[9]:
city state country shape duration (seconds) duration (hours/min) comments date posted latitude longitude word count
datetime
10/10/1949 20:30 san marcos tx us cylinder 2700.0 45 minutes This event took place in early fall around 194... 4/27/2004 29.883056 -97.941111 24
10/10/1949 21:00 lackland afb tx NaN light 7200.0 1-2 hrs 1949 Lackland AFB&#44 TX. Lights racing acros... 12/16/2005 29.384210 -98.581082 17
10/10/1955 17:00 chester (uk/england) NaN gb circle 20.0 20 seconds Green/Orange circular disc over Chester&#44 En... 1/21/2008 53.200000 -2.916667 6
10/10/1956 21:00 edna tx us circle 20.0 1/2 hour My older brother and twin sister were leaving ... 1/17/2004 28.978333 -96.645833 26
10/10/1960 20:00 kaneohe hi us light 900.0 15 minutes AS a Marine 1st Lt. flying an FJ4B fighter/att... 1/22/2004 21.418056 -157.803611 25
... ... ... ... ... ... ... ... ... ... ... ...
10/12/1982 05:30 pearlington ms us light 29.0 29 seconds Round light which was observed going into spac... 12/12/2013 30.246389 -89.611111 18
10/12/1985 23:45 swansboro nc us disk 300.0 23:45 - 23:50 My sister and I observed a disk for about 5 mi... 12/2/2000 34.687500 -77.119444 16
10/12/1988 16:30 melbourne (vic&#44 australia) NaN au cigar 900.0 15 min&#39s Large cigar shaped craft&#44flying sideways&#4... 9/29/2004 -37.813938 144.963425 8
10/12/1994 11:55 schenectady ny us other 120.0 a few minutes I had just picked up my Daugther (2yrs old) fr... 4/1/2001 42.814167 -73.940000 28
10/12/1994 15:00 monticello ky us chevron 120.0 1-2 minutes Triangular/chevron small object with fixed lig... 10/30/2006 36.829722 -84.849167 10

1001 rows × 11 columns

Apply your own functions¶

So what if you want to call your own functions on each element? Call the apply(...) function on a Series or DataFrame and pass in another function as an argument. Let's try writing a program that can remove the trailing parentheticals in the city name for the UFO dataset.

In [10]:
def clean_city_name(s):
    """
    Returns all the characters in the given string with trailing parentheticals removed.

    >>> clean_city_name("seattle (ballard area)")
    'seattle'
    >>> clean_city_name("seattle (west)")
    'seattle'
    >>> clean_city_name("melbourne (vic&#44 australia)")
    'melbourne'
    >>> clean_city_name("chester (uk/england)")
    'chester'
    >>> clean_city_name("carrieres sous poissy (france)")
    'carrieres sous poissy'
    >>> clean_city_name("seattle")
    'seattle'
    """
    index = s.find("(")
    if index == -1:
        return s
    return s[:index].rstrip()


doctest.run_docstring_examples(clean_city_name, globals())
In [14]:
ufos["city"].apply(clean_city_name).value_counts()
Out[14]:
city
new york city     12
seattle           12
tinley park        8
oak forest         8
las vegas          7
                  ..
holiday            1
eagan              1
siloam sprngs      1
canyonlands np     1
monticello         1
Name: count, Length: 774, dtype: int64
In [58]:
# Alternatively, we can use groupby on a Series.
# Kind of bizarre, but the argument to groupby is the Series itself, which Pandas uses to form groups
ufos["city"].apply(clean_city_name).groupby(ufos["city"].apply(clean_city_name)).count()
Out[58]:
city
acton             1
addison           1
adelaide          2
albany            4
albuquerque       5
                 ..
yangtzee river    1
yerevan           1
york              2
yuma              1
zlatoust          1
Name: city, Length: 774, dtype: int64
In [49]:
# Alternatively, we can make a DataFrame selection and then map the function to each element in the single-column DataFrame
ufos[["city"]].map(clean_city_name).groupby("city")["city"].count().idxmax()
Out[49]:
'new york city'

In practice, this can be useful for carrying out data cleaning tasks such as removing punctuation or converting special characters. apply lets us write and test a function that achieves our task on a single string, and then apply that function to every string in a dataset.
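
For example, the comments in this dataset contain HTML-encoded commas ("&#44"). Here's a minimal sketch of cleaning them up with apply, assuming no missing comments (the function name decode_commas is our own):

def decode_commas(s):
    """Returns the given string with every HTML entity &#44 replaced by a real comma."""
    return s.replace("&#44", ",")


ufos["comments"].apply(decode_commas)

The equivalent string accessor version, ufos["comments"].str.replace("&#44", ",", regex=False), achieves the same result and automatically skips over missing values.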