pandas cheat sheet
You haven't touched pandas for a while and are feeling a bit rusty?
Here you go!
# Standard imports plus display settings so wide frames print readably.
import numpy as np
import pandas as pd
np.set_printoptions(precision=4, suppress=True)  # 4 decimals, no scientific notation for numpy output
pd.options.display.max_columns = 20  # show up to 20 columns before truncating with "..."
pd.options.display.max_rows = 20  # show up to 20 rows before truncating
pd.options.display.max_colwidth = 80  # max characters shown per cell
Dataframe
Read csv and summarize data
df = pd.read_csv("data/toy-dataset.zip")  # read_csv can read zip archives directly
# if Number exists override the default index.
# (assumes the csv has a "Number" column — TODO confirm against the dataset)
df.set_index("Number", inplace=True)
print(" --- Info ---")
# NOTE: info() prints its report itself and returns None, so this also prints "None"
print(df.info())
print("")
print(" --- First Five Rows ---")
print(df.head(n=5))
print("")
print(" --- Last Two Rows ---")
print(df.tail(n=2))
print("")
print(" --- Describe ---")
# show basic stats for numerical columns
df.describe()
Basic operations
# Small frame with NaNs so the skipna behavior of the reductions is visible.
a = pd.DataFrame(
[[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
index=list("abcd"),
columns=["one", "two"],
)
print("--- print dataframe ----")
print(a)
print("")
print("--- column \"one\" ----")
# attribute access works when the column name is a valid identifier
print(a.one)
print("")
print("--- sum cols ----")
print(a.sum(axis=0)) # sum up each column
print("")
print("--- sum rows ----")
# NaNs are skipped by default, so the all-NaN row "c" sums to 0.0
print(a.sum(axis=1)) # sum up each row (across all columns)
print("")
print("--- mean by rows ----")
# skipna=False: any NaN in the row makes the mean NaN
print(a.mean(axis="columns", skipna=False)) # better to use axis=1
print("")
print("--- INDICES for max value by columns ----")
print(a.idxmax(axis=0)) # note that indices in this dataframe are letters
print("")
print("--- rows that contain the max values by column ----")
print(a.loc[a.idxmax(axis=0)]) # a[[b,d]]
print("")
print("--- INTEGER INDICES where smallest value ----")
print(a.one.argmin()) # take the Series ("one" column) and return int index of smallest value
--- print dataframe ----
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
--- column "one" ----
a 1.40
b 7.10
c NaN
d 0.75
Name: one, dtype: float64
--- sum cols ----
one 9.25
two -5.80
dtype: float64
--- sum rows ----
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
--- mean by rows ----
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
--- INDICES for max value by columns ----
one b
two d
dtype: object
--- rows that contain the max values by column ----
one two
b 7.10 -4.5
d 0.75 -1.3
--- INTEGER INDICES where smallest value ----
3
Create dataframe with some random data
import pandas as pd
import numpy as np
np.set_printoptions(precision=4, suppress=True)
# fixed seed so the generated frame is reproducible
np.random.seed(42)
num_rows = 20
df = pd.DataFrame(
{
# random dates drawn (with replacement) from the first half of 2018
"date": np.random.choice(
pd.date_range("2018-01-01", "2018-06-18", freq="D"), num_rows
),
"analysis_tool": np.random.choice(
["pandas", "r", "julia", "sas", "stata", "spss"], num_rows
),
"database": np.random.choice(
["postgres", "mysql", "sqlite", "oracle", "sql server", "db2"], num_rows
),
"os": np.random.choice(
["windows 10", "ubuntu", "mac os", "android", "ios", "windows 7", "debian"],
num_rows,
),
"num1": np.random.randn(num_rows) * 100,  # normal, scaled
"num2": np.random.uniform(0, 1, num_rows),  # uniform [0, 1)
"num3": np.random.randint(100, size=num_rows),  # ints 0..99
"bool": np.random.choice([True, False], num_rows),
},
# explicit column order (otherwise dict insertion order is used)
columns=["date", "analysis_tool", "num1", "database", "num2", "os", "num3", "bool"],
)
Create dataframe from list of lists
# Each inner list is one row of the resulting frame.
rows = [
    ["DS", "Linked_list", 10],
    ["DS", "Stack", 9],
    ["DS", "Queue", 7],
    ["Algo", "Greedy", 8],
    ["Algo", "DP", 6],
    ["Algo", "BackTrack", 5],
]
# Build the DataFrame, naming the three columns explicitly.
df = pd.DataFrame(rows, columns=["Category", "Name", "Marks"])
Series vs Dataframe
Libraries like sklearn want dataframes as input even if the dataframe only has one column.
# A tiny two-column frame to show the Series/DataFrame distinction.
df = pd.DataFrame(
    {"product": ["Beer", "Wine", "Whiskey"], "price": [6.5, 9.0, 12.0]}
)
# Single brackets -> a Series
s = df['price']
# Double brackets -> a one-column DataFrame
d = df[['price']]
# Promote an existing Series to a DataFrame
d2 = s.to_frame()
NaN
# NaN inspection recipes (assumes a frame `df` with a column "my_column" exists).
# True or False
df["my_column"].hasnans
# count the rows where the value is nan
df["my_column"].isna().sum()
# count the non-nan rows
df["my_column"].notna().sum()
# fill the na values with something
median = df["my_column"].median()
df["my_column"] = df["my_column"].fillna(median)
# for each row, checks if any column is NaN
null_rows_idx = df.isna().any(axis=1)
# show (up to) the first 20 rows that contain at least one NaN
df.loc[null_rows_idx].head(20)
loc, iloc and slices
# note that order_id is index
df = pd.DataFrame({
"order_id": [101, 102, 103, 104, 105],
"table": [1, 1, 2, 3, 3],
"item": ["Beer", "Wine", "Burger", "Beer", "Dessert"],
"price": [6.0, 9.0, 14.0, 6.0, 7.5]
}).set_index("order_id")
#############
# loc - label-based (names, conditions)
# Note: loc slicing is end-inclusive!
#############
# pick specific index labels and columns
df.loc[[101, 104], ["item", "price"]]
# boolean mask for rows, explicit column list
df.loc[df["price"] > 7, ["item", "price"]]
#############
# iloc - position-based (row/column numbers)
# Note: iloc slicing is end-exclusive!
#############
# get first row
df.iloc[0]
# first 3 rows, and last two columns
df.iloc[0:3, -2:]
# single cell
df.iloc[2, 1]
convert columns to specific data type
import pandas as pd
# everything starts out as strings, like data freshly read from a csv
df = pd.DataFrame({
"order_id": ["1001", "1002", "1003"],
"order_date": ["2025-01-05", "2025-01-06", "2025-01-06"],
"price": ["12.50", "9.00", "15.75"],
"is_refund": ["False", "True", "False"]
})
# all column types are "object"
print(df.dtypes)
print("")
# errors="raise" fails loudly on unparseable values instead of producing NaN
df["order_id"] = df["order_id"].astype("int64", errors="raise")
df["order_date"] = pd.to_datetime(df["order_date"], errors="raise")
df["price"] = pd.to_numeric(df["price"], errors="raise")
# astype(bool) would be wrong here: any non-empty string is truthy, so map explicitly
df["is_refund"] = df["is_refund"].map({"True": True, "False": False})
print(df.dtypes)
order_id object
order_date object
price object
is_refund object
dtype: object
order_id int64
order_date datetime64[ns]
price float64
is_refund bool
dtype: object
Duplicates
# Frame with fully duplicated rows (101/IPA/8 and 103/IPA/8 each appear twice).
df = pd.DataFrame({
"order_id": [101, 101, 102, 103, 103, 103],
"item": ["IPA", "IPA", "Lager", "IPA", "IPA", "Stout"],
"price": [8, 8, 7, 8, 8, 9]
})
# returns a series of False and True. True means this row is duplicated
# False -> First occurrence
# True -> All other occurrences
df.duplicated()
# get all the unique rows
df.drop_duplicates(keep="last") # or keep="first" (the default)
# get the rows that are duplicated
df[df.duplicated()]
# only consider a few columns
df[df.duplicated(subset=["order_id", "item"])]
# drop duplicates
df.drop_duplicates(subset=["order_id", "item"], keep="first") # or "last"
# see which rows have duplicates. keep=False marks EVERY occurrence as a duplicate,
# not just the 2nd and later ones
df[df.duplicated(subset=["order_id", "item"], keep=False)]
# get all unique rows and count its occurrence
df.groupby(["order_id", "item"]).size().reset_index(name="count")
Grouping
import pandas as pd
# one row per sold item: (date, category, item, price)
data = [
("2025-12-20", "Beer", "IPA", 8.00),
("2025-12-20", "Beer", "Lager", 7.00),
("2025-12-20", "Wine", "Red", 12.00),
("2025-12-21", "Beer", "IPA", 8.00),
("2025-12-21", "Wine", "White", 11.00),
("2025-12-21", "Cocktail", "Margarita", 14.00),
("2025-12-21", "Beer", "IPA", 8.00),
]
df = pd.DataFrame(
data,
columns=["date", "category", "item", "price"]
)
df["date"] = pd.to_datetime(df["date"])
# groupby and sum will create a series, where the values are prices
revenue_by_category = (
df
.groupby("category")["price"]
.sum()
.reset_index(name="total_revenue") # rename the price column to "total_revenue"
)
print(revenue_by_category)
print("")
# grouping by two keys -> MultiIndex Series; reset_index turns both keys back into columns
avg_price = (
df
.groupby(["category", "item"])["price"]
.mean()
.reset_index()
)
print(avg_price)
category total_revenue
0 Beer 31.0
1 Cocktail 14.0
2 Wine 23.0
category item price
0 Beer IPA 8.0
1 Beer Lager 7.0
2 Cocktail Margarita 14.0
3 Wine Red 12.0
4 Wine White 11.0
Reindex
Use reindex() when the shape of an operation is correct, but the index is incomplete.
Operations that create Series are usually groupby, value_counts, resample, pivot_table, etc.
# Sales data that only covers two of the five weekdays.
df = pd.DataFrame({
"day": ["Mon", "Mon", "Wed", "Wed", "Wed"], # only Monday and Wednesday
"sales": [100, 200, 50, 60, 70]
})
print(df)
print("")
# aggregate (this will create a Series)
daily = df.groupby("day")["sales"].sum()
print(daily)
print("")
# make sure all weekdays are in the data
all_days = ["Mon", "Tue", "Wed", "Thu", "Fri"]
# reindex inserts the missing labels; fill_value=0 avoids NaN for them
daily = daily.reindex(all_days, fill_value=0)
print(daily)
print("")
Reset_Index
After aggregating operations like group_by some of the columns are now in the index.
If, for example, you want to plot the data a reset_index will put columns back and create a new numerical index.
df = pd.DataFrame({
"country": ["US", "US", "DE", "DE", "FR"],
"year": [2023, 2024, 2023, 2024, 2024],
"revenue": [10, 12, 7, 9, 5]
})
print(df)
# grouping by two keys produces a Series with a (country, year) MultiIndex
g = df.groupby(["country", "year"])["revenue"].sum()
print(g)
# Key Error!
# g["country"]   # "country" is now part of the index, not a column
# need reset index (make index into columns)
g = g.reset_index()
print(g)
country year revenue
0 US 2023 10
1 US 2024 12
2 DE 2023 7
3 DE 2024 9
4 FR 2024 5
country year
DE 2023 7
2024 9
FR 2024 5
US 2023 10
2024 12
Name: revenue, dtype: int64
country year revenue
0 DE 2023 7
1 DE 2024 9
2 FR 2024 5
3 US 2023 10
4 US 2024 12
Drop Columns
# load_flicker_data is a project helper defined elsewhere — not shown here.
df = load_flicker_data()
print(df.info())
# columns we don't need for the analysis
drop_columns = [
"Edition Statement",
"Corporate Author",
"Corporate Contributors",
"Former owner",
"Engraver",
"Contributors",
"Issuance type",
"Shelfmarks",
]
# axis=1 drops columns (axis=0 would drop rows); inplace mutates df directly
df.drop(drop_columns, inplace=True, axis=1)
df.info()
dropna
Drop every row that has a NaN somewhere:
import pandas as pd

# A frame with NaNs scattered across two columns.
df = pd.DataFrame(
    {
        "vehicle_id": list("ABCD"),
        "speed": [55, None, 70, None],
        "timestamp": ["2024-01-01", "2024-01-02", None, "2024-01-04"],
    }
)
# Keep only rows with no missing value in ANY column.
df.dropna()
#   vehicle_id  speed   timestamp
# 0          A   55.0  2024-01-01
Sometimes NaN is OK in some columns; in that case pass subset=[...] to dropna() so only the listed columns are considered when dropping rows.
Convert Columns
Factorize
# Encode string categories as dense integer IDs (0, 1, 2, ... by first appearance).
df = pd.DataFrame(
    {
        "beer_style": [
            "IPA", "Stout", "IPA", "Pilsner",
            "Stout", "IPA", "Lager",
        ]
    }
)
# factorize returns (codes, uniques); the uniques array is unused in this recipe
codes, uniques = pd.factorize(df["beer_style"])
df['style_id'] = codes
Get_Dummies
Mostly used in Machine learning when feeding data into a model. A model cannot understand a string like "Urban".
So, we need to perform a technique called one hot encoding. get_dummies() creates for each store type a yes/no column.
Let's take this example:
df = pd.DataFrame({
"store_type": ["Urban", "Suburban", "Urban", "Rural"],
"revenue": [120, 80, 150, 60]
})
print(df)
print("")
# one boolean column per distinct store_type value
X = pd.get_dummies(df["store_type"])
print(X)
print("")
# one liner: encode in place, keep the other columns, prefix the new column names
print(pd.get_dummies(df, columns=["store_type"], prefix="key"))
store_type revenue
0 Urban 120
1 Suburban 80
2 Urban 150
3 Rural 60
Rural Suburban Urban
0 False False True
1 False True False
2 False False True
3 True False False
revenue key_Rural key_Suburban key_Urban
0 120 False False True
1 80 False True False
2 150 False False True
3 60 True False False
Cut
Create a bunch of equal sized bins to categories a numerical column.
# Categorize a numeric column into equal-width bins.
df = pd.DataFrame({
"customer_id": range(1, 13),
"total_spend": [
12, 35, 48, 52, 67, 80,
120, 150, 220, 310, 450, 900
]
})
num_bins = 3
# get the bin's intervals; retbins=True also returns the computed bin edges.
# Bug fix: pd.cut has no `num_bins` keyword — the bin count is the second
# positional argument (`bins`).
_, bins = pd.cut(df['total_spend'], num_bins, retbins=True)
# same binning, but label the bins 0/1/2 instead of interval objects
df['spending_category'] = pd.cut(df['total_spend'], num_bins, labels=[0, 1, 2])
print(df)
customer_id total_spend spending_category
0 1 12 0
1 2 35 0
2 3 48 0
3 4 52 0
4 5 67 0
5 6 80 0
6 7 120 0
7 8 150 0
8 9 220 0
9 10 310 1
10 11 450 1
11 12 900 2
Mapping
# Normalize free-text labels to canonical names with a lookup dict and map().
df = pd.DataFrame(
    {
        "beer_style_raw": [
            "IPA", "India Pale Ale", "I.P.A.",
            "Stout", "Imperial Stout", "Pils",
        ]
    }
)
# raw spelling -> canonical style; raw values missing from the dict would map to NaN
style_map = {
    "IPA": "IPA",
    "India Pale Ale": "IPA",
    "I.P.A.": "IPA",
    "Stout": "Stout",
    "Imperial Stout": "Stout",
    "Pils": "Pilsner",
}
df['beer_style'] = df["beer_style_raw"].map(style_map)
print(df)
beer_style_raw beer_style
0 IPA IPA
1 India Pale Ale IPA
2 I.P.A. IPA
3 Stout Stout
4 Imperial Stout Stout
5 Pils Pilsner
Searching/Filtering
The searching/filtering syntax always trips me up. Hopefully this section will be the last one I need to write.
Let's make the dataframe example a bit larger.
import pandas as pd
# A richer menu dataset: one row per (venue, item).
df = pd.DataFrame([
# menu_id, venue, city, state, category, item, producer, price, currency, abv, rating, in_stock, on_tap, updated_at
(101, "The Fox & Hound", "New York", "NY", "Beer", "Guinness Draught", "Guinness", 9.00, "USD", 4.2, 4.7, True, True, "2025-12-20 18:15"),
(101, "The Fox & Hound", "New York", "NY", "Beer", "Heineken", "Heineken", 8.00, "USD", 5.0, 4.0, True, False, "2025-12-20 18:15"),
(102, "Sunset Cantina", "Austin", "TX", "Spirit","Margarita (House)", "Tequila", 14.00, "USD", None, 4.4, True, False, "2025-12-21 20:05"),
(102, "Sunset Cantina", "Austin", "TX", "Spirit","Old Fashioned", "Bourbon", 16.00, "USD", None, 4.6, True, False, "2025-12-21 20:05"),
(103, "Harbor Grill", "Miami", "FL", "Wine", "Sauvignon Blanc (Glass)", "Kim Crawford",13.00,"USD", None, 4.1, False, False, "2025-12-22 12:30"),
(103, "Harbor Grill", "Miami", "FL", "Wine", "Cabernet Sauvignon (Glass)","Josh Cellars",14.00,"USD", None, 4.2, True, False, "2025-12-22 12:30"),
(104, "Brew Lab", "San Diego","CA", "Beer", "Pliny the Elder", "Russian River",10.00,"USD", 8.0, 4.8, True, True, "2025-12-23 19:10"),
(104, "Brew Lab", "San Diego","CA", "Beer", "Hazy IPA", "Local", 9.50, "USD", 6.5, 4.3, True, True, "2025-12-23 19:10"),
(105, "Green Room", "Seattle", "WA", "Beer", "Non-Alcoholic IPA", "Athletic", 7.00, "USD", 0.5, 4.0, True, False, "2025-12-23 16:40"),
(106, "Pasta House", "Boston", "MA", "Wine", "Chianti (Bottle)", "Ruffino", 42.00, "USD", None, 4.0, True, False, "2025-12-19 21:00"),
], columns=[
"menu_id","venue","city","state","category","item","producer",
"price","currency","abv","rating","in_stock","on_tap","updated_at"
])
# convert to datetime type
df["updated_at"] = pd.to_datetime(df["updated_at"])
# df.info()
##########################
# simple filtering
##########################
# combine masks with & / |, each condition in its own parentheses
df[(df["category"] == "Beer") & (df["state"] == "NY")]
df[df["state"].isin(["NY", "CA", "TX"])]
# ~ negates a boolean mask ("not in")
df[~df["state"].isin(["NY", "CA", "TX"])]
##########################
# string search
##########################
# na=False treats missing strings as "no match" instead of propagating NaN
df[df["item"].str.contains("ipa", case=False, na=False)]
df[df["producer"].str.startswith("Ru", na=False)]
##########################
# Numeric filters: between, missing values, top-N
##########################
df[df["price"].between(9, 14)]  # between is inclusive on both ends
df[df["abv"].notna()]
df.nlargest(3, "rating")[["venue","item","rating"]]
##########################
# Datetime range filters
##########################
# date strings are compared against the datetime column directly
df[df["updated_at"] >= "2025-12-22"]
df[df["updated_at"].between("2025-12-21", "2025-12-23 18:00")]
##########################
# compound conditions plus sorting
##########################
recommend = df[
(df["in_stock"]) &
((df["category"] == "Beer") | (df["category"] == "Wine")) &
(df["rating"] >= 4.2)
].sort_values(["rating","price"], ascending=[False, True])
recommend[["venue","category","item","price","rating"]]
##########################
# “Group-wise” filtering
##########################
# keep only rows whose venue carries at least two beers
beer_counts = df[df["category"] == "Beer"].groupby("venue").size()
venues_with_2_beers = beer_counts[beer_counts >= 2].index
df[(df["category"] == "Beer") & (df["venue"].isin(venues_with_2_beers))]
query
Query() gives an easy way to specify conditions. But be aware that it's NOT sql.
Things that should work are: and, or, not, in, comparisons (e.g ==, >, <=), parentheses
Things that might cause troubles: like, between, subqueries
Copy or Drop
When you filter a dataframe and need to mutate the result use copy().
# Sample data for the copy-vs-drop recipes.
df = pd.DataFrame(
    {
        "venue": ["A", "B", "C", "D"],
        "category": ["Beer", "Wine", "Beer", "Spirit"],
        "price": [8, 15, 12, 18],
        "rating": [4.0, 4.5, 3.8, 4.7],
    }
)
# Filter, then copy: mutating the result won't touch df (and avoids
# SettingWithCopy warnings).
df_filtered = df.loc[df["price"] >= 10].copy()
# Dropping by index labels is the alternative, if you really must.
beer_rows = df[df["category"] == "Beer"].index
df_dropped = df.drop(beer_rows)
Transform
Use case: Give me a per-group value aligned back to each row.
Below is a small example showing how to remove outlier rows by filtering out speeds that fall outside the 1st–99th percentile range.
import pandas as pd
# Per-vehicle speed samples with one or two deliberate outliers per vehicle.
data = [
{"vehicle_id": "V001", "speed": 32},
{"vehicle_id": "V001", "speed": 35},
{"vehicle_id": "V001", "speed": 34},
{"vehicle_id": "V001", "speed": 120}, # extreme high
{"vehicle_id": "V001", "speed": 5}, # extreme low
{"vehicle_id": "V002", "speed": 55},
{"vehicle_id": "V002", "speed": 58},
{"vehicle_id": "V002", "speed": 60},
{"vehicle_id": "V002", "speed": 62},
{"vehicle_id": "V002", "speed": 140}, # extreme high
{"vehicle_id": "V003", "speed": 25},
{"vehicle_id": "V003", "speed": 27},
{"vehicle_id": "V003", "speed": 26},
{"vehicle_id": "V003", "speed": 28},
{"vehicle_id": "V003", "speed": 1}, # extreme low
]
df = pd.DataFrame(data)
# transform returns one value PER ROW (the group's quantile broadcast back),
# so lower/upper align with df and can be used directly as masks
lower = df.groupby('vehicle_id')['speed'].transform(lambda x: x.quantile(0.01))
upper = df.groupby('vehicle_id')['speed'].transform(lambda x: x.quantile(0.99))
# NOTE: strict < / > means the group's min and max are always dropped,
# even when they are not extreme — use <= / >= if that is not wanted
no_outliers_df = df[(df['speed'] > lower) & (df['speed'] < upper)]
Apply
Use case: Give me a custom result per group.
This can be pretty slow sometimes. Have a look here to speed it up: pandarallel.
The following example applies some tax rates to items and then creates three new columns.
# Orders to price out: tax depends on whether alcohol is involved.
df = pd.DataFrame(
    {
        "order_id": [1, 2, 3],
        "subtotal": [42.50, 18.00, 64.25],
        "is_alcohol": [True, False, True],
    }
)

def compute_costs(row):
    """Return (tax, tip, total) for one order row as a positional Series."""
    subtotal = row["subtotal"]
    rate = 0.10 if row["is_alcohol"] else 0.07  # alcohol is taxed higher
    tax = subtotal * rate
    tip = subtotal * 0.20  # flat 20% tip
    return pd.Series([tax, tip, subtotal + tax + tip])

# axis=1 -> compute_costs is called once per row; the returned Series
# becomes three new columns
df[["tax", "tip", "total"]] = df.apply(compute_costs, axis=1)
Pivots
Pandas clearly wins at pivoting data compared to SQL! I sometimes download data from a database, pivot it, and then re-upload it back to the database.
import pandas as pd
# One revenue row per (date, store, channel, category).
df = pd.DataFrame([
# NYC
{"date":"2025-12-01", "store":"NYC", "channel":"dine_in", "category":"Beer", "revenue":120},
{"date":"2025-12-01", "store":"NYC", "channel":"takeout", "category":"Beer", "revenue":60},
{"date":"2025-12-02", "store":"NYC", "channel":"dine_in", "category":"Wine", "revenue":90},
{"date":"2025-12-02", "store":"NYC", "channel":"delivery", "category":"Beer", "revenue":40},
{"date":"2025-12-03", "store":"NYC", "channel":"delivery", "category":"Wine", "revenue":35},
{"date":"2025-12-04", "store":"NYC", "channel":"takeout", "category":"Wine", "revenue":50},
# SF
{"date":"2025-12-03", "store":"SF", "channel":"dine_in", "category":"Beer", "revenue":110},
{"date":"2025-12-03", "store":"SF", "channel":"delivery", "category":"Wine", "revenue":55},
{"date":"2025-12-05", "store":"SF", "channel":"delivery", "category":"Beer", "revenue":45},
{"date":"2025-12-05", "store":"SF", "channel":"takeout", "category":"Wine", "revenue":40},
])
df["date"] = pd.to_datetime(df["date"])
# derive a weekly bucket from the date for the pivot's row index
df["week"] = df["date"].dt.to_period("W").astype(str)
#######################
# Pivot: revenue by (store, week) x channel
#######################
# aggfuncs
# numeric aggfunc: size, mean, median, min, max, std, var
# counting aggfunc: count (number of non-null rows), size (number of all rows), nunique (number of unique rows)
rev_pivot = df.pivot_table(
index=["store", "week"],
columns="channel",
values="revenue",
aggfunc="sum", # or ["sum", "mean", "size"],
fill_value=0,  # put 0 where a (store, week) has no rows for a channel
)
##################
# Normalization: convert each row to shares (each row sums to 1.0)
##################
# divide every cell by its row total (axis=0 aligns the divisor with the rows)
rev_share = rev_pivot.div(rev_pivot.sum(axis=1), axis=0)
print("Revenue pivot:\n", rev_pivot, "\n")
# the sum per row is 1
print("Row-normalized shares:\n", rev_share.round(3))
Revenue pivot:
channel delivery dine_in takeout
store week
NYC 2025-12-01/2025-12-07 75 210 110
SF 2025-12-01/2025-12-07 100 110 40
Row-normalized shares:
channel delivery dine_in takeout
store week
NYC 2025-12-01/2025-12-07 0.19 0.532 0.278
SF 2025-12-01/2025-12-07 0.40 0.440 0.160
Crosstab
crosstab() is a nice quick shortcut for pivoting (when counting) and has normalization built in at the same time.
# One row per customer visit: when they came and what they drank.
visits = pd.DataFrame([
{"customer":"c1", "daypart":"Lunch", "drink":"Beer"},
{"customer":"c2", "daypart":"Lunch", "drink":"Beer"},
{"customer":"c3", "daypart":"Lunch", "drink":"Wine"},
{"customer":"c4", "daypart":"Dinner", "drink":"Wine"},
{"customer":"c5", "daypart":"Dinner", "drink":"Beer"},
{"customer":"c6", "daypart":"Dinner", "drink":"Cocktail"},
{"customer":"c7", "daypart":"Late", "drink":"Beer"},
{"customer":"c8", "daypart":"Late", "drink":"Cocktail"},
{"customer":"c9", "daypart":"Late", "drink":"Cocktail"},
])
# Counts table: rows = dayparts, columns = drinks, cells = visit counts
counts = pd.crosstab(index=visits["daypart"], columns=visits["drink"])
# Normalization options
row_pct = pd.crosstab(visits["daypart"], visits["drink"], normalize="index") # per-daypart distribution
col_pct = pd.crosstab(visits["daypart"], visits["drink"], normalize="columns") # for each drink, where it happens
overall = pd.crosstab(visits["daypart"], visits["drink"], normalize=True) # overall %
print("Counts:\n", counts, "\n")
print("Row % (normalize='index'):\n", row_pct.round(3), "\n")
print("Col % (normalize='columns'):\n", col_pct.round(3), "\n")
print("Overall % (normalize=True):\n", overall.round(3))
Counts:
drink Beer Cocktail Wine
daypart
Dinner 1 1 1
Late 1 2 0
Lunch 2 0 1
Row % (normalize='index'):
drink Beer Cocktail Wine
daypart
Dinner 0.333 0.333 0.333
Late 0.333 0.667 0.000
Lunch 0.667 0.000 0.333
Col % (normalize='columns'):
drink Beer Cocktail Wine
daypart
Dinner 0.25 0.333 0.5
Late 0.25 0.667 0.0
Lunch 0.50 0.000 0.5
Overall % (normalize=True):
drink Beer Cocktail Wine
daypart
Dinner 0.111 0.111 0.111
Late 0.111 0.222 0.000
Lunch 0.222 0.000 0.111
Merge, Stack, Concat
import pandas as pd
# Three related tables: orders, the customers placing them, and order line items.
orders = pd.DataFrame({
"order_id": [1, 2, 3],
"customer_id": [10, 20, 10],
"total": [45.0, 23.5, 67.0]
})
customers = pd.DataFrame({
"customer_id": [10, 20, 30],
"name": ["Alice", "Bob", "Charlie"],
"city": ["NYC", "LA", "SF"]
})
items = pd.DataFrame({
"order_id": [1, 1, 2, 3, 3],
"item": ["Beer", "Burger", "Beer", "Wine", "Cheese"],
"price": [8, 15, 8, 12, 10]
})
######################
# Merge
######################
# SQL-style join on the shared customer_id column
orders_with_customers = orders.merge(
customers,
on="customer_id",
how="left" # inner, right, outer
)
print(orders_with_customers)
print("")
######################
# Vertical stacking with concat
######################
orders_day_1 = orders.iloc[:2]
orders_day_2 = orders.iloc[2:]
# rows of day 2 are appended below day 1 (original index labels are kept)
all_orders = pd.concat([orders_day_1, orders_day_2])
print(all_orders)
print("")
######################
# Horizontal stacking with concat
######################
order_totals = orders[["order_id", "total"]]
order_flags = pd.DataFrame({
"vip": [True, False, True]
})
# axis=1 places the frames side by side, aligned on the row index
combined = pd.concat([order_totals, order_flags], axis=1)
print(combined)
print("")
######################
# Stacking - converting columns to rows
######################
# create df with cities as index and two product columns with sales data
sales = pd.DataFrame({
"Beer": [120, 90],
"Wine": [80, 110]
}, index=["NYC", "LA"])
# actually name the index and columns
sales.index.name = "city"
sales.columns.name = "product"
print(sales)
print("")
# only one column
# stack() moves the product columns into the index; reset_index flattens back to columns
print(sales.stack().reset_index(name="sales"))
print("")
# in one line. long means that we are reformatting from columns to rows (wide -> long)
long = (
sales.rename_axis(index="city", columns="product")
.stack()
.reset_index(name="sales")
)
order_id customer_id total name city
0 1 10 45.0 Alice NYC
1 2 20 23.5 Bob LA
2 3 10 67.0 Alice NYC
order_id customer_id total
0 1 10 45.0
1 2 20 23.5
2 3 10 67.0
order_id total vip
0 1 45.0 True
1 2 23.5 False
2 3 67.0 True
product Beer Wine
city
NYC 120 80
LA 90 110
city product sales
0 NYC Beer 120
1 NYC Wine 80
2 LA Beer 90
3 LA Wine 110
Rolling Windows
Sometimes you want to look back 5 mins (or any other time span) and do some calculations.
Always make sure that the data is sorted!
import pandas as pd
# Irregularly spaced speed samples for two vehicles.
data = [
{"vehicle_id": "V1", "event_ts": "2024-01-01 08:00:00", "speed": 35.0},
{"vehicle_id": "V1", "event_ts": "2024-01-01 08:02:00", "speed": 38.5},
{"vehicle_id": "V1", "event_ts": "2024-01-01 08:04:30", "speed": 36.2},
{"vehicle_id": "V1", "event_ts": "2024-01-01 08:07:00", "speed": 40.1},
{"vehicle_id": "V1", "event_ts": "2024-01-01 08:10:00", "speed": 42.0},
{"vehicle_id": "V2", "event_ts": "2024-01-01 08:01:00", "speed": 28.0},
{"vehicle_id": "V2", "event_ts": "2024-01-01 08:03:00", "speed": 30.5},
{"vehicle_id": "V2", "event_ts": "2024-01-01 08:06:00", "speed": 29.8},
{"vehicle_id": "V2", "event_ts": "2024-01-01 08:09:30", "speed": 31.2},
{"vehicle_id": "V2", "event_ts": "2024-01-01 08:12:00", "speed": 33.0},
]
df = pd.DataFrame(data)
df["event_ts"] = pd.to_datetime(df["event_ts"])
# time-based rolling requires sorted timestamps within each group
df = df.sort_values(by=["vehicle_id", "event_ts"])
result = (
# a time-based rolling window needs a DatetimeIndex, hence set_index first
df.set_index(keys="event_ts")
.groupby(by="vehicle_id")["speed"]
# look back 5 minutes from each row (per vehicle), then average
.rolling(pd.Timedelta(minutes=5))
.mean()
.reset_index(name="speed_5min_avg")
)
# vehicle_id event_ts speed_5min_avg
# 0 V1 2024-01-01 08:00:00 35.000000
# 1 V1 2024-01-01 08:02:00 36.750000
# 2 V1 2024-01-01 08:04:30 36.566667
# 3 V1 2024-01-01 08:07:00 38.150000
# 4 V1 2024-01-01 08:10:00 41.050000
# 5 V2 2024-01-01 08:01:00 28.000000
# 6 V2 2024-01-01 08:03:00 29.250000
# 7 V2 2024-01-01 08:06:00 30.150000
# 8 V2 2024-01-01 08:09:30 30.500000
# 9 V2 2024-01-01 08:12:00 32.100000
Accessing previous or future values
Using pandas's shift() to access a previous row is essential in time-series calculations.
import pandas as pd

# Two vehicles, a few timestamped speed readings each.
df = pd.DataFrame(
    {
        "vehicle_id": ["A", "A", "A", "B", "B"],
        "event_ts": pd.to_datetime(
            [
                "2024-01-01 08:00",
                "2024-01-01 08:01",
                "2024-01-01 08:02",
                "2024-01-01 09:00",
                "2024-01-01 09:01",
            ]
        ),
        "speed": [30, 32, 31, 50, 55],
    }
)
# Sort first, then shift WITHIN each vehicle so B's first row does not
# inherit A's last speed; each group's first prev_speed is NaN.
df = df.sort_values(["vehicle_id", "event_ts"]).assign(
    prev_speed=lambda d: d.groupby("vehicle_id")["speed"].shift(1)
)
df["speed_change"] = df["speed"] - df["prev_speed"]
vehicle_id event_ts speed prev_speed speed_change
0 A 2024-01-01 08:00:00 30 NaN NaN
1 A 2024-01-01 08:01:00 32 30.0 2.0
2 A 2024-01-01 08:02:00 31 32.0 -1.0
3 B 2024-01-01 09:00:00 50 NaN NaN
4 B 2024-01-01 09:01:00 55 50.0 5.0
Series
From dict
# Building a Series from a dict: the keys (states) become the index.
d = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
s = pd.Series(d)
Time Series with data
# 12 hourly timestamps starting 5:30pm, with simulated temperatures.
# Bug fix: gauss was used without being imported.
from random import gauss

dates = pd.date_range('2016/10/29 5:30pm', periods=12, freq='h')
# one gaussian temperature sample (mean 20, sd 6) per timestamp
temperatures = [gauss(mu=20, sigma=6) for i in range(12)]
pd.Series(data=temperatures, index=dates)
Reindex
Use reindex() when the shape of an operation is correct, but the index is incomplete.
Operations that create Series are usually groupby, value_counts, resample, pivot_table, etc.
# A sparse daily series: some calendar dates are missing from the index.
ts = pd.Series(
    [10, 12, 15],
    index=pd.DatetimeIndex(["2025-01-01", "2025-01-03", "2025-01-06"]),
)
print(ts)
# Reindex against the full calendar; dates that were absent get 0.
full_index = pd.date_range("2025-01-01", "2025-01-06", freq="D")
ts = ts.reindex(full_index, fill_value=0)
print(ts)
Horizontal stacking
# Horizontal stacking: three Series become the columns a, b, c.
a = pd.Series(range(0, 8))
b = pd.Series(range(9, 17))
c = pd.Series(range(18, 26))
pd.concat([a, b, c], axis=1, keys=['a', 'b', 'c'])
MultiIndex
# A two-level index built from (outer, inner) tuples.
pairs = [("one", "a"), ("one", "b"), ("two", "a"), ("two", "b")]
index = pd.MultiIndex.from_tuples(pairs)
# values 1.0 .. 4.0, one per index tuple
s = pd.Series(np.arange(1.0, 5.0), index=index)
Common ML operations
Get all numerical attributes
Use Cases
Download tables from wikipedia
from io import StringIO
import pandas as pd
import requests
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
# browser-like headers, otherwise wikipedia may reject the request
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/121.0 Safari/537.36",
"Accept-Language": "en-US,en;q=0.9",
}
resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()
# wrap the html in StringIO — passing raw strings to read_html is deprecated
html = StringIO(resp.text)
# note the flavor! pandas prefers lxml by default; flavor="bs4" needs
# beautifulsoup4 installed — TODO confirm it is in the environment
tables = pd.read_html(html, flavor="bs4")
print("Num Tables:", len(tables))
for i, t in enumerate(tables):
print(i, t.shape)
Merge Metadata with Data
import pandas as pd
# Meta information about the measurement columns in the values' dataframe
df_meta = pd.DataFrame({
"column_name": ["speed", "rpm", "temp"],
"display_name": ["Vehicle Speed", "Engine RPM", "Coolant Temp"],
"unit": ["mph", "rpm", "°C"],
"sensor_type": ["motion", "engine", "engine"],
})
# the actual values. Each column (speed, rpm, and temp) is explained in meta dataframe
df_values = pd.DataFrame({
"timestamp": pd.to_datetime(["2026-01-01 08:00:00", "2026-01-01 08:01:00"]),
"speed": [55, 57],
"rpm": [2200, 2300],
"temp": [90, 91],
})
# 1. create "long" from the values: one row per (timestamp, measurement)
df_long = df_values.melt(
id_vars=["timestamp"],
var_name="column_name",
value_name="value",
)
# 2. merge metadata onto each measurement row
df_long = df_long.merge(df_meta, on="column_name", how="left")
# df_long now has: timestamp, column_name, value, display_name, unit, sensor_type
# 3. pivot back to wide using metadata as the "header"
# Choose which metadata fields you want as your multi-row header:
header_levels = ["display_name", "unit"] # or ["sensor_type", "display_name", "unit"]
# each entry in header_levels becomes one level of the resulting column MultiIndex
df_wide = (
df_long
.pivot(index="timestamp", columns=header_levels, values="value")
.sort_index(axis=1)
.reset_index()
)
df_wide