import pandas as pd

# Load the air quality readings, using the "Time" column as a parsed datetime index.
seattle_air = pd.read_csv("seattle_air.csv", index_col="Time", parse_dates=True)
# Count the non-missing values in each column, grouped by the year of the index.
seattle_air.groupby(seattle_air.index.year).count()

# Plain assignment does not copy: both names now refer to the same object.
original_seattle_air = seattle_air
original_seattle_air

# dropna returns a new Series without the NaN values; the result is only displayed, not stored.
seattle_air["PM2.5"].dropna()

Time
2017-04-06 00:00:00    6.8
2017-04-06 01:00:00    5.3
2017-04-06 02:00:00    5.3
2017-04-06 03:00:00    5.6
2017-04-06 04:00:00    5.9
                      ... 
2022-04-06 19:00:00    5.1
2022-04-06 20:00:00    5.0
2022-04-06 21:00:00    5.3
2022-04-06 22:00:00    5.2
2022-04-06 23:00:00    5.2
Name: PM2.5, Length: 43059, dtype: float64

# Display the frame: nothing has been assigned to seattle_air since it was loaded.
seattle_air

seattle_air

seattle_air.dropna() # By default, dropna returns a new data frame and does not modify the underlying one

seattle_air

# Rebind the name seattle_air to the new object returned by dropna.
seattle_air = seattle_air.dropna()
seattle_air

# original_seattle_air was never reassigned, so it still names the object that was loaded.
original_seattle_air

# Rebind seattle_air again, this time to the result of dropna on just the "PM2.5" column.
seattle_air = seattle_air["PM2.5"].dropna()

seattle_air

Time
2017-04-06 00:00:00    6.8
2017-04-06 01:00:00    5.3
2017-04-06 02:00:00    5.3
2017-04-06 03:00:00    5.6
2017-04-06 04:00:00    5.9
                      ... 
2022-04-06 19:00:00    5.1
2022-04-06 20:00:00    5.0
2022-04-06 21:00:00    5.3
2022-04-06 22:00:00    5.2
2022-04-06 23:00:00    5.2
Name: PM2.5, Length: 43059, dtype: float64

original_seattle_air

# This assignment does not appear to modify the data frame.
# (The column really is reassigned; the displayed result just looks the same.)
# NOTE(review): presumably pandas aligns the shorter right-hand Series back onto the
# frame's index, so the rows that dropna removed reappear as NaN -- confirm in the pandas docs.
# NOTE(review): assumes seattle_air is still a DataFrame when this cell runs -- verify cell order.
seattle_air["PM2.5"] = seattle_air["PM2.5"].dropna()
seattle_air

# Index with the boolean mask produced by isna() to look at the missing entries.
seattle_air[seattle_air.isna()]

original_seattle_air

class DataFrame:
    """Represents two-dimensional tabular data structured around an index and column names.

    This is a teaching sketch of how a data frame class could be laid out, not a working
    implementation: the ``[...]`` values below are placeholders for real results, and each
    method prints a message so that the order of calls is visible when the class is used.
    """

    def __init__(self, index, columns, data):
        """Initializes a new DataFrame object from the given index, columns, and tabular data."""
        print("Initializing DataFrame")
        # seattle_air["PM2.5"] = ...
        self.index = index
        self.columns = columns
        self.data = data

    # seattle_air.dropna() -> dropna(seattle_air, ...)
    def dropna(self, inplace=False):
        """
        Drops all rows containing NaN from this DataFrame. If inplace, returns None and modifies
        self. If not inplace, returns a new DataFrame without modifying self.
        """
        print("Calling dropna")
        if inplace:
            # In-place: overwrite the fields of this object and return nothing.
            self.columns = [...]
            self.index = [...]
            self.data = [...]
            return None
        # Not in-place: leave self untouched and hand back a brand new object.
        return DataFrame([...], [...], [...])

    # seattle_air["PM2.5"] -> seattle_air.__getitem__("PM2.5") -> DataFrame.__getitem__(seattle_air, "PM2.5")
    def __getitem__(self, column_or_indexer):
        """Given a column or indexer, returns the selection as a new Series or DataFrame object."""
        print("Calling __getitem__")
        if column_or_indexer in self.columns:
            return "Series" # placeholder for a Series
        # Otherwise, treat the argument as an indexer (e.g. a boolean Series for filtering).
        return DataFrame([...], [...], [...])

    def __repr__(self):
        """Return a string representation of this object that can be evaluated in Python to reproduce the object."""
        # Use !r so each field is rendered as a Python literal (e.g. strings keep their quotes).
        return f"DataFrame({self.index!r}, {self.columns!r}, {self.data!r})"


example = DataFrame([0, 1, 2], ["PM2.5"], [10, 20, 30])
example["PM2.5"]

Initializing DataFrame
Calling __getitem__

'Series'

example["PM2.5"]["2024-04-06"]
# example.__getitem__("PM2.5").__getitem__("2024-04-06")
# Series.__getitem__(DataFrame.__getitem__(example, "PM2.5"), "2024-04-06")

# Remember that Python will pass in the instance for self as the first argument!
# In the commented-out call below, staff would accidentally be passed in twice:
# staff.__getitem__(staff, "Hours").__getitem__(staff, "Iris")
staff.__getitem__("Hours").__getitem__("Iris")

# df.__getitem__("foo").__setitem__(df["bar"] > 5, 100)
# This actually has a subtle bug in it! It is not clear from the code what is intended.
df["foo"][df["bar"] > 5] = 100

# df.__getitem__("foo") -> Series but is this column associated with the original dataframe?
#                          or is it a copy?
#                      .__setitem__(df["bar"] > 5, 100)
#                       -> is this changing the original df? Or is it just changing some temporary Series?
# Pandas' solution is copy on write: in future versions of pandas this code simply will not work.

# We are doing one df.loc.__setitem__( (df["bar"] > 5, "foo"), 100 )
df.loc[df["bar"] > 5, "foo"] = 100

# Why does using .loc avoid the problem above with chained assignments?
# Because it does not materialize a temporary Series (which was the cause of all the potential confusion)

example

DataFrame([0, 1, 2], ['PM2.5'], [10, 20, 30])

DataFrame([0, 1, 2], ['PM2.5'], [10, 20, 30])

Initializing DataFrame

DataFrame([0, 1, 2], ['PM2.5'], [10, 20, 30])

# 1. Write the template out and convert it to Python code for each method.
# 2. Figure out what data/state you need to keep track of in your class.
# 3. Sometimes, it can be helpful to start with the initializer.
#    Other times, it can be helpful to start with the other methods.
#    And it's probably useful to write them somewhat in tandem (at the same time).

class Student:
    """..."""
    # Is there a notion of a static method? Yes, in Python using a special decorator.

    def __init__(self, number: int, filename: str) -> None:
        """..."""
        self.name = filename[:-4]
        self.number = number
        self.filename = filename
        self.courses = {}
        with open(filename) as f:
            for line in f.readlines():
                course, credits = line.split()
                self.courses[course] = int(credits)

    def __getitem__(self, course: str) -> int | None:
        """..."""
        if course not in self.courses:
            return None
        return self.courses[course]

    def get_courses(self) -> list[str]:
        """..."""
        return list(self.courses)

    def __repr__(self):
        """Return a string representation of this object that can be evaluated in Python to reproduce the object."""
        return f"Student({self.number}, '{self.filename}')"


# Build a Student from the courses listed in nicole.txt.
nicole = Student(1234567, "nicole.txt")
# Print each course name next to its credits; nicole[course] calls Student.__getitem__.
for course in nicole.get_courses():
    print(course, nicole[course])

<cell>31: error: Function is missing a type annotation  [no-untyped-def]

CSE163 4
PHIL100 4
CSE390HA 1

nicole["CSE163"]

4

# To access information that was passed to the initializer, we need to assign it as a field
nicole.number

1234567

nicole

Student(1234567, 'nicole.txt')

Student(1234567, 'nicole.txt')

Student(1234567, 'nicole.txt')

!pip install -q nb_mypy
%reload_ext nb_mypy
%nb_mypy mypy-options --strict

[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: pip install --upgrade pip

Version 1.0.5

# How to sort students?
students = [nicole, Student(10, "nicole.txt")]
students[0].name

'nicole'

students

[Student(1234567, 'nicole.txt'), Student(10, 'nicole.txt')]

# How do we define a way to sort these Student objects? It's ambiguous!
sorted(students)

<cell>2: error: Value of type variable "SupportsRichComparisonT" of "sorted" cannot be "Student"  [type-var]

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[25], line 2
      1 # How do we define a way to sort these Student objects? It's ambiguous!
----> 2 sorted(students)

TypeError: '<' not supported between instances of 'Student' and 'Student'

# key= takes a function as an argument, which when given an object from your list,
#      it should return something that can be sorted in Python (str, int, float)

def get_name(student: Student) -> str:
    """Returns the name of the given student, for use as a key= function when sorting."""
    name = student.name
    return name

def get_number(student: Student) -> int:
    """Returns the number of the given student, for use as a key= function when sorting."""
    number = student.number
    return number

sorted(students, key=get_number) # Do not call the function with parentheses!

[Student(10, 'nicole.txt'), Student(1234567, 'nicole.txt')]

# Shorter way to do the same thing using lambdas (inline function definition)!
sorted(students, key=lambda student: student.number)

[Student(10, 'nicole.txt'), Student(1234567, 'nicole.txt')]

# Even more "Pythonic" (definitely some problems with limited framings of what is Pythonic)

from operator import attrgetter

sorted(students, key=attrgetter("number"))

[Student(10, 'nicole.txt'), Student(1234567, 'nicole.txt')]

# Pretty handy to write type annotations as part of your templating.
# How does Python evaluate a class?
#   Look inside the class for definitions, and bind them to the class!
#   University.__init__ will get the __init__ function definition

class University:
    """Represents a university with a name and a list of enrolled students."""

    def __init__(self, name: str, students: list[Student] | None = None) -> None:
        """Initializes a new University object with the given name and list of students.

        If no students are given, the University starts out with no students enrolled.
        """
        self.name = name
        # The default is None rather than [] because a default value is created only once,
        # so a list default would be shared by every University constructed without students.
        # A given list is copied so that enrolling students here does not modify the caller's list.
        self.students: list[Student] = [] if students is None else list(students) # Could be a dictionary

    def enrollments(self) -> list[Student]:
        """Returns a list of all students enrolled in this University sorted alphabetically by name."""
        return sorted(self.students, key=attrgetter("name"))

    def enroll(self, student: Student) -> None:
        """Enrolls the given student in this University"""
        self.students.append(student)

    def roster(self, course: str) -> list[Student]:
        """Given a course name, return the list of students who are enrolled in that course."""
        # Indexing a student by a course gives None when the student is not taking that course.
        # In the Search homework, you will need to sort the documents by their relevance!
        return [student for student in self.students if student[course] is not None]

    # What other methods might you like?

    # What other methods might you like?


uw = University("Udub", [nicole])
uw.enrollments()

[Student(1234567, 'nicole.txt')]

# Constructing a new University with no students, means that I don't have any students!
wsu = University("Wazzu")
wsu.enrollments()

[]

# Construct another new University with no students: it should also have no students.
seattle_u = University("SeattleU")
seattle_u.enrollments()

[]

seattle_u.enroll(nicole)
seattle_u.enrollments()

[Student(1234567, 'nicole.txt')]

wsu.enrollments()

[Student(1234567, 'nicole.txt')]

# UW has a separate enrollment list because we passed in a specific argument for students!
uw.enroll(nicole)
uw.enrollments()

[Student(1234567, 'nicole.txt'), Student(1234567, 'nicole.txt')]

wsu.enrollments()

[Student(1234567, 'nicole.txt')]

seattle_u.enrollments()

[Student(1234567, 'nicole.txt')]

	PM2.5
Time
2017	6283
2018	8540
2019	8597
2020	8683
2021	8664
2022	2292

Objects¶

Reference semantics¶

Defining classes¶

Practice: `Student` class¶

Type annotations¶

Practice: `University` class¶

Mutable default parameters¶

	PM2.5
Time
2017-04-06 00:00:00	6.8
2017-04-06 01:00:00	5.3
2017-04-06 02:00:00	5.3
2017-04-06 03:00:00	5.6
2017-04-06 04:00:00	5.9
...	...
2022-04-06 19:00:00	5.1
2022-04-06 20:00:00	5.0
2022-04-06 21:00:00	5.3
2022-04-06 22:00:00	5.2
2022-04-06 23:00:00	5.2

Objects¶

Reference semantics¶

Defining classes¶

Practice: Student class¶

Type annotations¶

Practice: University class¶

Mutable default parameters¶

Practice: `Student` class¶

Practice: `University` class¶