Database Leetcode Examples

code
Author

Daniel Claborne

Published

October 25, 2025

I went through a bunch of leetcode database problems and tried to solve them in pandas, polars, and SQL as interview prep. I thought someone might find it useful to have full code examples, from table creation to the slicing operations, so here they are. A random note: AI is very useful here for formatting the notebooks and turning the problem tables into the code that creates those tables in pandas/polars (not for giving me the solutions! bad!).

import pandas as pd
import polars as pl
import polars.selectors as cs

import numpy as np
import math
import random
import duckdb
# https://leetcode.com/problems/game-play-analysis-iv/description/
# Write a solution to report the fraction of players that logged in again on the day after the day they first logged in, rounded to 2 decimal places. In other words, you need to determine the number of players who logged in on the day immediately following their initial login, and divide it by the number of total players.

# +-----------+-----------+------------+--------------+
# | player_id | device_id | event_date | games_played |
# +-----------+-----------+------------+--------------+
# | 1         | 2         | 2016-03-01 | 5            |
# | 1         | 2         | 2016-03-02 | 6            |
# | 2         | 3         | 2017-06-25 | 1            |
# | 3         | 1         | 2016-03-02 | 0            |
# | 3         | 4         | 2018-07-03 | 5            |
# +-----------+-----------+------------+--------------+

# create both data frames
data = {
    "player_id": [1, 1, 2, 3, 3],
    "device_id": [2, 2, 3, 1, 4],
    "event_date": ["2016-03-01", "2016-03-02", "2017-06-25", "2016-03-02", "2018-07-03"],
    "games_played": [5, 6, 1, 0, 5],
}
df_pandas = pd.DataFrame(data)
df_polars = pl.DataFrame(data)
pandas solution
# in pandas
result = df_pandas.sort_values(['player_id', 'event_date']).groupby("player_id").first().reset_index()
result = pd.merge(df_pandas, result, on = 'player_id')
# convert the string dates to datetimes and take the difference in days
result['days_diff'] = (pd.to_datetime(result['event_date_x']) - pd.to_datetime(result['event_date_y'])).dt.days

sum(result['days_diff'] == 1) / (df_pandas.drop_duplicates("player_id").shape[0])
polars solution
# in polars - measure each login's offset from the player's first login so rows
# from different players are never compared
(
    df_polars
    .with_columns(pl.col('event_date').str.to_date())
    .with_columns((pl.col('event_date') - pl.col('event_date').min().over('player_id')).dt.total_days().alias("diff"))
    .filter(pl.col("diff") == 1).shape[0]    
) / df_polars.unique("player_id").shape[0]
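sql solution

One possible duckdb version, sketched along the same lines as the later sql solutions: compute each player's first login, then count the players who logged in exactly one day after it.

duckdb.sql(
    """
    with firsts as (
        select player_id, min(event_date::date) as first_login
        from df_pandas
        group by player_id
    )

    select round(
        count(*) filter (where date_diff('day', f.first_login, a.event_date::date) = 1)
        / count(distinct a.player_id), 2
    ) as fraction
    from df_pandas a
    join firsts f
    on a.player_id = f.player_id
    """
)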
# https://leetcode.com/problems/product-sales-analysis-iii/description/
# Write a solution to find all sales that occurred in the first year each product was sold.

# Create the first table in pandas and polars
sales_data = {
    "sale_id": [1, 2, 7],
    "product_id": [100, 100, 200],
    "year": [2008, 2009, 2011],
    "quantity": [10, 12, 15],
    "price": [5000, 5000, 9000],
}
sales_df_pandas = pd.DataFrame(sales_data)
sales_df_polars = pl.DataFrame(sales_data)

# Create the second table in pandas and polars (dummy...not sure why this is here)
# products_data = {
#     "product_id": [100, 200, 300],
#     "product_name": ["Nokia", "Apple", "Samsung"],
# }
# products_df_pandas = pd.DataFrame(products_data)
# products_df_polars = pl.DataFrame(products_data)
pandas solution
sales_df_pandas['mindate'] = sales_df_pandas.groupby("product_id")['year'].transform("min")

sales_df_pandas[sales_df_pandas['mindate'] == sales_df_pandas['year']]
polars solution
# with polars....so much better
(
    sales_df_polars
    .with_columns(mindate = pl.col('year').min().over("product_id"))
    .filter(pl.col('mindate') == pl.col('year'))
)
sql solution
duckdb.sql(
    """
    with cte as (
        select product_id, min(year) as minyear
        from sales_df_pandas
        group by product_id
    )

    select p.product_id, p.year
    from sales_df_pandas as p
    left join cte
    on p.product_id = cte.product_id
    where p.year = cte.minyear
    """
)
## https://leetcode.com/problems/monthly-transactions-i/description
# Write an SQL query to find for each month and country, the number of transactions and their total amount, the number of approved transactions and their total amount.

transactions_data = {
    "id": [121, 122, 123, 124, 125, 126],
    "country": ["US", "US", "US", "DE", "FR", "FR"],
    "state": ["approved", "declined", "approved", "approved", "declined", "declined"],
    "amount": [1000, 2000, 2000, 2000, 1500, 1800],
    "trans_date": ["2018-12-18", "2018-12-19", "2019-01-01", "2019-01-07", "2019-01-15", "2019-01-20"],
}
transactions_df_pandas = pd.DataFrame(transactions_data)
transactions_df_polars = pl.DataFrame(transactions_data)
pandas solution for Monthly Transactions
# Find for each month and country, the number of transactions and their total amount, the number of approved transactions and their total amount.

# get the date at the month level
transactions_df_pandas['yearmonth'] = pd.to_datetime(transactions_df_pandas['trans_date']).dt.to_period("M")

# group by month and country as requested, and then get the total number of transactions and then the total amount of all transactions
all_trans = transactions_df_pandas.groupby(['country', 'yearmonth'], as_index = False)['amount'].sum()
all_counts = transactions_df_pandas.groupby(['country', 'yearmonth'], as_index=False)['amount'].count()

# do the same but with only approved transactions (groups with no approved transactions get filled with 0 after the merges)
approved_trans = transactions_df_pandas[transactions_df_pandas['state'] == 'approved'].groupby(['country', 'yearmonth'], as_index=False)['amount'].sum()
approved_counts = transactions_df_pandas[transactions_df_pandas['state'] == 'approved'].groupby(['country', 'yearmonth'], as_index=False)['amount'].count()

# merge everything
out_df = pd.merge(all_trans, all_counts, how = 'left', on = ['country', 'yearmonth'])
out_df = pd.merge(out_df, approved_trans, how = 'left', on = ['country', 'yearmonth'])
out_df = pd.merge(out_df, approved_counts, how = 'left', on = ['country', 'yearmonth'], suffixes=['a', 'b'])

out_df = out_df.fillna(0)

out_df = out_df.rename({
    'amount_x': 'trans_total_amount',
    "amount_y": "trans_count",
    "amounta": 'approved_total_amount',
    "amountb": 'approved_count',
    "yearmonth":'month'
}, axis=1)

# select the renamed columns
out_df[['country', 'month', 'trans_count', 'approved_count', 'trans_total_amount', 'approved_total_amount']]
polars solution for Monthly Transactions
# polars with .filter() retains entries with zero counts, nice!
(
    transactions_df_polars
    .with_columns(
        pl.col("trans_date").str.to_date().dt.month_start().alias("yearmonth")
    )
    .group_by(['country', 'yearmonth'])
    .agg(
        pl.col("amount").sum().alias("wooo"),
        pl.col("amount").filter(pl.col("state") == "approved").sum().alias("asdf"),
        pl.col("amount").count().alias("trans_count"),
        pl.col("amount").filter(pl.col("state") == "approved").count().alias("approved_count")
    )
)
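sql solution for Monthly Transactions

A duckdb sketch of the same aggregation (it queries the polars frame, which still has only the original columns); the filter clause on the aggregates plays the same role as the .filter() inside polars' agg.

duckdb.sql(
    """
    select
        date_trunc('month', trans_date::date) as month,
        country,
        count(*) as trans_count,
        count(*) filter (where state = 'approved') as approved_count,
        sum(amount) as trans_total_amount,
        coalesce(sum(amount) filter (where state = 'approved'), 0) as approved_total_amount
    from transactions_df_pl
    group by month, country
    """
)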
# https://leetcode.com/problems/customers-who-bought-all-products/description/

# Write a solution to report the customer ids from the Customer table that bought all the products in the Product table.

# Create the Customer table in pandas
customer_data = {
    "customer_id": [1, 2, 3, 3, 1],
    "product_key": [5, 6, 5, 6, 6],
}
customers_pd = pd.DataFrame(customer_data)

# Create the Product table in pandas
product_data = {
    "product_key": [5, 6],
}
product_pd = pd.DataFrame(product_data)

# Create the Customer table in polars
customer_pl = pl.DataFrame(customer_data)

# Create the Product table in polars
product_pl = pl.DataFrame(product_data)
pandas solution
n_products = product_pd.shape[0]

out_df = customers_pd.groupby("customer_id", as_index=False)['product_key'].nunique()
out_df[out_df['product_key'] == n_products]['customer_id']
polars solution
(
    customer_pl
    .group_by("customer_id")
    .agg(pl.col("product_key").n_unique().alias("n_products"))
    .filter(pl.col('n_products') == n_products)
    .select("customer_id")
)
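sql solution

A duckdb sketch: count each customer's distinct products and keep the customers whose count matches the size of the Product table.

duckdb.sql(
    """
    select customer_id
    from customers_pd
    group by customer_id
    having count(distinct product_key) = (select count(*) from product_pd)
    """
)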
# https://leetcode.com/problems/consecutive-numbers/?envType=study-plan-v2&envId=top-sql-50

# Find all numbers that appear at least three times consecutively.

consecutive_data = {
    "id": [1, 2, 3, 4, 5, 6, 7],
    "num": [1, 1, 1, 2, 1, 2, 2],
}
df_pd = pd.DataFrame(consecutive_data)
df_pl = pl.DataFrame(consecutive_data)
pandas solution
# create shifted versions of the num column
df_pd['s1'] = df_pd['num'].shift(-1)
df_pd['s2'] = df_pd['num'].shift(-2)

df_pd[(df_pd['num'] == df_pd['s1']) & (df_pd['s1'] == df_pd['s2'])]['num'].drop_duplicates()
polars solution
(
    df_pl
    .with_columns(
        pl.col("num").shift(-1).alias('num2'),
        pl.col("num").shift(-2).alias('num3')
    )
    .filter(pl.col("num") == pl.col('num2'), pl.col('num2') == pl.col('num3'))
    .select('num')
    .unique()
)
sql solution
duckdb.sql(
    """
    select distinct(a.num)
    from df_pd a, df_pd b, df_pd c
    where a.id = b.id - 1 and b.id = c.id - 1 and a.num = b.num and b.num = c.num
    """
)
# https://leetcode.com/problems/product-price-at-a-given-date/?envType=study-plan-v2&envId=top-sql-50
# Initially, all products have price 10.
# Write a solution to find the prices of all products on the date 2019-08-16

price_change_data = {
    "product_id": [1, 2, 1, 1, 2, 3],
    "new_price": [20, 50, 30, 35, 65, 20],
    "change_date": [
        "2019-08-14",
        "2019-08-14",
        "2019-08-15",
        "2019-08-16",
        "2019-08-17",
        "2019-08-18",
    ],
}
pc_pd = pd.DataFrame(price_change_data)
pc_pl = pl.DataFrame(price_change_data)
pandas solution
# make a column indicating if an entry is before the indicated date
pc_pd['is_before'] = pc_pd['change_date'] <= '2019-08-16'

# get the last change date for each product (this may not exist)
max_change = pc_pd[pc_pd['is_before'] == True].groupby("product_id", as_index = False)['change_date'].max()

# merge and only keep entries for the last change (could be NA)
pc_pd = pc_pd.merge(max_change, how = 'left', on = 'product_id')
pc_pd = pc_pd[(pc_pd['change_date_x'] == pc_pd['change_date_y']) | pc_pd['change_date_y'].isna()]

# replace the na prices with 10, since we assume the initial price is 10 
pc_pd['cur_price'] = pc_pd.apply(lambda row: row['new_price'] if pd.notna(row['change_date_y']) else 10, axis=1)

pc_pd[['product_id', 'cur_price']]
polars solution
(
    pc_pl
    .group_by("product_id")
    .agg(
        pl.col("change_date").filter(pl.col("change_date") <= '2019-08-16').max()
    ).join(
        pc_pl,
        how = 'left',
        on = ['product_id', 'change_date']
    )
    .with_columns(
        pl.when(pl.col("new_price").is_null()).then(10)
        .otherwise(pl.col('new_price'))
        .alias("date_price")
    )
    .select(['product_id', 'date_price'])
)
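sql solution

A duckdb sketch of the same idea (it queries pc_pl, since pc_pd was overwritten by the merge in the pandas solution): find each product's last change on or before 2019-08-16, join back to get that price, and default to 10 for products with no change yet.

duckdb.sql(
    """
    with last_change as (
        select product_id, max(change_date) as change_date
        from pc_pl
        where change_date <= '2019-08-16'
        group by product_id
    )

    select p.product_id, coalesce(pc.new_price, 10) as price
    from (select distinct product_id from pc_pl) p
    left join last_change lc
    on p.product_id = lc.product_id
    left join pc_pl pc
    on lc.product_id = pc.product_id and lc.change_date = pc.change_date
    """
)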
# https://leetcode.com/problems/last-person-to-fit-in-the-bus/description
# There is a queue of people waiting to board a bus. However, the bus has a weight limit of 1000 kilograms, so there may be some people who cannot board.

# Write a solution to find the person_name of the last person that can fit on the bus without exceeding the weight limit. The test cases are generated such that the first person does not exceed the weight limit.

# Note that only one person can board the bus at any given turn.

bus_data = {
    "person_id": [5, 4, 3, 6, 1, 2],
    "person_name": ["Alice", "Bob", "Alex", "John Cena", "Winston", "Marie"],
    "weight": [250, 175, 350, 400, 500, 200],
    "turn": [1, 5, 2, 3, 6, 4],
}
bus_df_pd = pd.DataFrame(bus_data)
bus_df_pl = pl.DataFrame(bus_data)
pandas solution for Last Person to Fit in the Bus
bus_df_pd['cumsum'] = bus_df_pd.sort_values("turn")['weight'].expanding().sum()

out_df = bus_df_pd[bus_df_pd['cumsum'] <= 1000]
out_df = out_df[out_df['turn'] == max(out_df['turn'])]
polars solution for Last Person to Fit in the Bus
(
    bus_df_pl
    .sort("turn")
    .with_columns(
        cumsum = pl.col("weight").cum_sum()
    )
    .filter(pl.col("cumsum") <= 1000)
    .filter(pl.col("cumsum") == pl.col("cumsum").max())
)
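sql solution for Last Person to Fit in the Bus

A duckdb sketch using a running sum over the boarding order, mirroring the cum_sum approach above.

duckdb.sql(
    """
    with boarded as (
        select person_name, turn,
            sum(weight) over (order by turn) as total_weight
        from bus_df_pd
    )

    select person_name
    from boarded
    where total_weight <= 1000
    order by turn desc
    limit 1
    """
)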
# https://leetcode.com/problems/count-salary-categories/description/
# Write a solution to calculate the number of bank accounts for each salary category. The salary categories are:

# "Low Salary": All the salaries strictly less than $20000.
# "Average Salary": All the salaries in the inclusive range [$20000, $50000].
# "High Salary": All the salaries strictly greater than $50000.
# The result table must contain all three categories. If there are no accounts in a category, return 0.

income_data = {
    "account_id": [3, 2, 8, 6],
    "income": [108939, 12747, 87709, 91796],
}
income_df_pd = pd.DataFrame(income_data)

income_df_pl = pl.DataFrame(income_data)

# Additional larger example with 20 rows
large_income_data = {
    "account_id": list(range(1, 21)),
    "income": [
        15000, 25000, 35000, 45000, 55000, 65000, 75000, 85000, 95000, 105000,
        115000, 125000, 135000, 145000, 155000, 165000, 175000, 185000, 195000, 205000
    ],
}
large_income_df_pd = pd.DataFrame(large_income_data)
large_income_df_pl = pl.DataFrame(large_income_data)
pandas solution
categories_table = pd.DataFrame({"category":["Low Salary", "Average Salary", "High Salary"]})

income_df_pd['category'] = income_df_pd['income'].case_when(
    [
        (income_df_pd.eval("income < 20000"), "Low Salary"),
        (income_df_pd.eval("20000 <= income <= 50000"), "Average Salary"),
        (income_df_pd.eval("income > 50000"), "High Salary")


    ]
)

income_df_pd = income_df_pd.groupby("category", as_index=False)['income'].count().rename({"income":"accounts_count"}, axis=1)

out_df = categories_table.merge(income_df_pd, on = "category", how = "left")
out_df['accounts_count'] = out_df['accounts_count'].fillna(0)
sql solution
duckdb.sql(
    """
    with categorytable as (
        select * from (values ('Low Salary'), ('Average Salary'), ('High Salary'))
        as t (category)
    ),
    salarytable as (
        select 
            account_id,
            case when income < 20000 then 'Low Salary'
            when income between 20000 and 50000 then 'Average Salary'
            else 'High Salary' end as category
        from large_income_df_pd
    )

    select t1.category, coalesce(ccount, 0) as ccount from
    (
        categorytable t1
        left join
        (
            select category, count(category) as ccount
            from salarytable
            group by category
        ) t2
        on t1.category = t2.category
    )
    """
)
polars solution
categories_table = pl.DataFrame({"category":["Low Salary", "Average Salary", "High Salary"]})

income_df_pl = large_income_df_pl

(
    income_df_pl
    .with_columns(
        pl.when(pl.col("income") < 20_000)
        .then(pl.lit("Low Salary"))
        .when(pl.col("income").is_between(20_000, 50_000))
        .then(pl.lit("Average Salary"))
        .otherwise(pl.lit("High Salary"))
        .alias("category")
    )
    .group_by("category")
    .len(name = "accounts_count")
    .join(
        categories_table,
        how = "right",
        on = "category"
    )
    .with_columns(
        accounts_count = pl.when(pl.col("accounts_count").is_null()).then(0).otherwise(pl.col("accounts_count"))
    )
)
# https://leetcode.com/problems/exchange-seats/description/?envType=study-plan-v2&envId=top-sql-50
# Write a solution to swap the seat id of every two consecutive students. If the number of students is odd, the id of the last student is not swapped.

seats_data = {
    "id": [1, 2, 3, 4, 5],
    "student": ["Abbot", "Doris", "Emerson", "Green", "Jeames"],
}
seats_df_pd = pd.DataFrame(seats_data)

seats_df_pl = pl.DataFrame(seats_data)
pandas solution
# create a new id where every consecutive id is swapped (e.g. reverse 1 and 2, 3 and 4, etc.)
join_df = seats_df_pd.copy()
join_df['swapped'] = join_df['id'].apply(lambda x: x + 1 if x%2 == 1 else x-1)

# join on the id, which will have the swapped names aligned with the original ids
result = seats_df_pd.merge(
    join_df,
    how = 'left',
    left_on='id',
    right_on='swapped'
)

# take care of nas due to e.g. 5 getting mapped to 6 in a 5 row table
result.loc[result['student_y'].isna(), 'student_y'] = result.loc[result['student_y'].isna(), 'student_x']
result[['id_x', 'student_y']].rename(columns={'id_x': 'id', 'student_y': 'student'})
polars solution
(
    seats_df_pl
    .with_columns(
        swapped = pl.when(pl.col('id') % 2 == 1)
        .then(pl.col('id') + 1)
        .otherwise(pl.col('id') - 1)
    )
    .join(
        seats_df_pl,
        how = 'left',
        left_on='swapped',
        right_on='id'
    )
    .with_columns(
        student = pl.when(pl.col('student_right').is_null())
        .then(pl.col('student'))
        .otherwise(pl.col('student_right'))
    )
    .select('id', 'student')
)
sql solution
duckdb.sql(
    """
    with swapped as (
        select case
            when id % 2 = 1 then id + 1
            else id - 1
        end as swapped, student
        from seats_df_pd
    )

    select
        id,
        coalesce(s2, s1) as student    
    from 
    (
        select id, t.student as s1, s.student as s2
        from
        seats_df_pd t
        left join
        swapped s
        on t.id = s.swapped   
    )
    """
)
# https://leetcode.com/problems/movie-rating/description/
# Write a solution to:

# Find the name of the user who has rated the greatest number of movies. In case of a tie, return the lexicographically smaller user name.
# Find the movie name with the highest average rating in February 2020. In case of a tie, return the lexicographically smaller movie name.

# Create the Movies table in pandas
movies_data = {
    "movie_id": [1, 2, 3],
    "title": ["Avengers", "Frozen 2", "Joker"],
}
movies_df_pd = pd.DataFrame(movies_data)

# Create the Users table in pandas
users_data = {
    "user_id": [1, 2, 3, 4],
    "name": ["Daniel", "Monica", "Maria", "James"],
}
users_df_pd = pd.DataFrame(users_data)

# Create the MovieRating table in pandas
movie_rating_data = {
    "movie_id": [1, 1, 1, 1, 2, 2, 2, 3, 3],
    "user_id": [1, 2, 3, 4, 1, 2, 3, 1, 2],
    "rating": [3, 4, 2, 1, 5, 2, 2, 3, 4],
    "created_at": [
        "2020-01-12", "2020-02-11", "2020-02-12", "2020-01-01",
        "2020-02-17", "2020-02-01", "2020-03-01", "2020-02-22", "2020-02-25"
    ],
}
movie_rating_df_pd = pd.DataFrame(movie_rating_data)

# Create the Movies table in polars
movies_df_pl = pl.DataFrame(movies_data)

# Create the Users table in polars
users_df_pl = pl.DataFrame(users_data)

# Create the MovieRating table in polars
movie_rating_df_pl = pl.DataFrame(movie_rating_data)
pandas solution for Movie Ratings
# ratings_cts = movie_rating_df_pd.groupby("user_id", as_index=False).count()

# top_name = users_df_pd.merge(ratings_cts, on = "user_id").sort_values(['movie_id', 'name'], ascending=[False, True])['name'][0]

# ratings_feb = movie_rating_df_pd[pd.to_datetime(movie_rating_df_pd['created_at']).dt.month == 2]

# avg_rating = ratings_feb.groupby("movie_id")['rating'].mean().reset_index()
# avg_rating = avg_rating.merge(movies_df_pd, on = "movie_id")
# avg_rating.sort_values(['rating', 'title'], ascending=[False, True])['title'].to_list()[0]

mv_df = movie_rating_df_pd.merge(
    users_df_pd,
    on = "user_id" 
)

count_df = mv_df.groupby(['user_id', 'name'], as_index=False)['movie_id'].nunique().sort_values('movie_id', ascending=False)

# power_user = count_df[count_df['movie_id'] == max(count_df['movie_id'])].sort_values('name')['name'][0]
power_user = count_df.sort_values(['movie_id', 'name'], ascending=[False, True])['name'].iloc[0]

mv_df = movie_rating_df_pd.merge(
    movies_df_pd,
    on='movie_id'
)

# restrict to February 2020 (not just any February)
created_dates = pd.to_datetime(mv_df['created_at'])
avg_df = mv_df[(created_dates.dt.month == 2) & (created_dates.dt.year == 2020)].groupby("title", as_index=False)['rating'].agg("mean")

top_movie = avg_df.sort_values(["rating", 'title'],ascending=[False, True])['title'].to_list()[0]
polars solution for Movie Ratings
(
    movie_rating_df_pl
    .join(
        users_df_pl,
        on = "user_id" 
    )
    .group_by(["user_id", "name"])
    .agg(pl.col("movie_id").n_unique())
    .filter(pl.col('movie_id') == pl.col('movie_id').max())
    .sort("name")
    .select("name")
)[0]

(
    movie_rating_df_pl
    .join(
        movies_df_pl,
        on = "movie_id" 
    )
    .with_columns(
        pl.col("created_at").str.to_date()
    )
    .filter(pl.col("created_at").dt.month() == 2, pl.col("created_at").dt.year() == 2020)
    .group_by('movie_id')
    .agg(pl.col("rating").mean(), pl.col("title").first())
    .filter(pl.col("rating") == pl.col("rating").max())
    .sort("title")
)['title'][0]
sql solution
duckdb.sql(
    """
    with full_tbl as (
        select * from 
        movie_rating_df_pd
        left join
        movies_df_pd
        on movie_rating_df_pd.movie_id = movies_df_pd.movie_id
        left join
        users_df_pd
        on movie_rating_df_pd.user_id = users_df_pd.user_id
    )

    select * from
    (
        select name from 
            (
                select user_id, name, count(user_id) as ct
                from
                full_tbl
                group by user_id, name
            )
        order by ct desc, name
        limit 1
    )
    union all
    (
        select title from 
        (
            select movie_id, title, avg(rating) as avg_rating
            from full_tbl
            where date_part('month', created_at::date) = 2 and date_part('year', created_at::date) = 2020
            group by movie_id, title
        )
        order by avg_rating desc, title
        limit 1
    )
    """
)
# https://leetcode.com/problems/second-highest-salary/description
# Write a solution to find the second highest distinct salary from the Employee table. If there is no second highest salary, return null (return None in Pandas).

# +----+--------+
# | id | salary |
# +----+--------+
# | 1  | 100    |
# | 2  | 200    |
# | 3  | 300    |
# +----+--------+

salary_data = {
    "id": list(range(1, 21)),
    "salary": [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000,
               11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 18000, 20000],
}

# Create the DataFrame in pandas
salary_df_pd = pd.DataFrame(salary_data)

# Create the DataFrame in polars
salary_df_pl = pl.DataFrame(salary_data)
pandas solution for Second Highest Salary
# Check if there are at least 2 unique salaries
unique_salaries = salary_df_pd['salary'].unique()
if len(unique_salaries) >= 2:
    second_highest = sorted(unique_salaries)[-2]
else:
    second_highest = None

print(f"Second highest salary: {second_highest}")
polars solution for Second Highest Salary
# Check if there are at least 2 unique salaries
unique_salaries = salary_df_pl['salary'].unique()
if len(unique_salaries) >= 2:
    second_highest = unique_salaries.sort()[-2]
else:
    second_highest = None

print(f"Second highest salary: {second_highest}")
sql solution for Second Highest Salary
duckdb.sql(
    """
    with rankedsalary as 
    (
        select salary, dense_rank() over (order by salary desc) as drank
        from salary_df_pd
    )

    select 
        case when exists (select 1 from rankedsalary where drank = 2) 
        then (
            select salary 
            from rankedsalary 
            where drank = 2 
            limit 1
        )
        else NULL
        end as SecondHighestSalary
    """
)
## https://leetcode.com/problems/patients-with-a-condition/description/
# Write a solution to find the patient_id, patient_name, and conditions of the patients who have Type I Diabetes. Type I Diabetes always starts with DIAB1 prefix.

# +------------+--------------+--------------+
# | patient_id | patient_name | conditions   |
# +------------+--------------+--------------+
# | 1          | Daniel       | YFEV COUGH   |
# | 2          | Alice        |              |
# | 3          | Bob          | DIAB100 MYOP |
# | 4          | George       | ACNE DIAB100 |
# | 5          | Alain        | DIAB201      |
# +------------+--------------+--------------+
patient_data = {
    "patient_id": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "patient_name": ["Daniel", "Alice", "Bob", "George", "Alain", "Emma", "Sophia", "Michael", "Sarah"],
    "conditions": ["YFEV COUGH", "", "DIAB100 MYOP", "ACNE DIAB100", "DIAB201", "ASTHMA", "HYPERTENSION", "DIAB100 HYPERTENSION", "DIAB100 ASTHMA"],
}
patients_df_pd = pd.DataFrame(patient_data)
patients_df_pl = pl.DataFrame(patient_data)
pandas solution for Patient Conditions
import re

patients_df_pd['has_diab'] = patients_df_pd['conditions'].apply(lambda x: re.search(r'^DIAB1|(?<=\s)DIAB1', x) is not None)

patients_df_pd[patients_df_pd['has_diab']]
polars solution for Patient Conditions
patients_df_pl.filter(
    pl.col("conditions").str.contains(pattern=r'^DIAB1|[\s]{1}DIAB1')
)
sql solution for Patient Conditions
duckdb.sql(
    """
    select * from patients_df_pd
    where 
        conditions ^@ 'DIAB1'
        or conditions LIKE '% DIAB1%'
    """
)
# https://leetcode.com/problems/department-top-three-salaries/

# A company's executives are interested in seeing who earns the most money in each of the company's departments. A high earner in a department is an employee who has a salary in the top three unique salaries for that department.

# Write a solution to find the employees who are high earners in each of the departments.

# Employee table:
# +----+-------+--------+--------------+
# | id | name  | salary | departmentId |
# +----+-------+--------+--------------+
# | 1  | Joe   | 85000  | 1            |
# | 2  | Henry | 80000  | 2            |
# | 3  | Sam   | 60000  | 2            |
# | 4  | Max   | 90000  | 1            |
# | 5  | Janet | 69000  | 1            |
# | 6  | Randy | 85000  | 1            |
# | 7  | Will  | 70000  | 1            |
# +----+-------+--------+--------------+
# Department table:
# +----+-------+
# | id | name  |
# +----+-------+
# | 1  | IT    |
# | 2  | Sales |
# +----+-------+
# Create the Employee table in pandas
employee_data = {
    "id": [1, 2, 3, 4, 5, 6, 7],
    "name": ["Joe", "Henry", "Sam", "Max", "Janet", "Randy", "Will"],
    "salary": [85000, 80000, 60000, 90000, 69000, 85000, 70000],
    "departmentId": [1, 2, 2, 1, 1, 1, 1],
}
employee_df_pd = pd.DataFrame(employee_data)

# Create the Department table in pandas
department_data = {
    "id": [1, 2],
    "name": ["IT", "Sales"],
}
department_df_pd = pd.DataFrame(department_data)

# Create the Employee table in polars
employee_df_pl = pl.DataFrame(employee_data)

# Create the Department table in polars
department_df_pl = pl.DataFrame(department_data)
pandas solution for Department Top Three Salaries
df_pd = employee_df_pd.merge(
    department_df_pd,
    left_on='departmentId',
    right_on='id'
)

df_pd['drank'] = df_pd.groupby("departmentId")['salary'].rank(method='dense', ascending=False)

df_pd[df_pd['drank'] <= 3][['name_y', 'name_x', 'salary']]
polars solution for Department Top Three Salaries
(
    employee_df_pl.join(
        department_df_pl,
        left_on='departmentId',
        right_on='id'
    )
    .with_columns(
        drank = pl.col('salary').rank("dense", descending=True).over("departmentId")
    )
    .filter(pl.col("drank") <=3)
    .select(['name', 'salary', 'name_right'])
)
sql solution for Department Top Three Salaries
duckdb.sql(
    """
    with cte as (
        select e.name as "Employee", d.name as "Department", e.salary as "Salary", dense_rank() over(partition by d.name order by e.salary) as drank
        from
        employee_df_pd e
        left join
        department_df_pd d
        on e.departmentId = d.id
    )

    select "Department", "Employee", "Salary" from cte
    where drank < 4
    """
)
# https://leetcode.com/problems/investments-in-2016/description/

# Write a solution to report the sum of all total investment values in 2016 tiv_2016, for all policyholders who:

# have the same tiv_2015 value as one or more other policyholders, and
# are not located in the same city as any other policyholder (i.e., the (lat, lon) attribute pairs must be unique).
# Round tiv_2016 to two decimal places.

# The result format is in the following example.

# | pid | tiv_2015 | tiv_2016 | lat  | lon  |
# | --- | -------- | -------- | ---- | ---- |
# | 1   | 224.17   | 952.73   | 32.4 | 20.2 |
# | 2   | 224.17   | 900.66   | 52.4 | 32.7 |
# | 3   | 824.61   | 645.13   | 72.4 | 45.2 |
# | 4   | 424.32   | 323.66   | 12.4 | 7.7  |
# | 5   | 424.32   | 282.9    | 12.4 | 7.7  |
# | 6   | 625.05   | 243.53   | 52.5 | 32.8 |
# | 7   | 424.32   | 968.94   | 72.5 | 45.3 |
# | 8   | 624.46   | 714.13   | 12.5 | 7.8  |
# | 9   | 425.49   | 463.85   | 32.5 | 20.3 |
# | 10  | 624.46   | 776.85   | 12.4 | 7.7  |
# | 11  | 624.46   | 692.71   | 72.5 | 45.3 |
# | 12  | 225.93   | 933      | 12.5 | 7.8  |
# | 13  | 824.61   | 786.86   | 32.6 | 20.3 |
# | 14  | 824.61   | 935.34   | 52.6 | 32.8 |
# | 15  | 826.37   | 516.1    | 12.4 | 7.7  |
# | 16  | 824.61   | 374.5    | 12.6 | 7.9  |
# | 17  | 824.61   | 924.19   | 32.6 | 20.4 |
# | 18  | 626.81   | 897.47   | 52.6 | 32.9 |
# | 19  | 224.76   | 714.79   | 72.6 | 45.4 |
# | 20  | 224.76   | 681.53   | 12.4 | 7.7  |
# | 21  | 427.25   | 263.27   | 32.7 | 20.4 |
# | 22  | 224.76   | 671.8    | 52.7 | 32.9 |
# | 23  | 424.9    | 769.18   | 72.7 | 45.4 |
# | 24  | 227.69   | 830.5    | 12.7 | 7.9  |
# | 25  | 424.9    | 844.97   | 12.4 | 7.7  |
# | 26  | 424.9    | 733.35   | 52.7 | 32.9 |
# | 27  | 828.13   | 931.83   | 72.8 | 45.5 |
# | 28  | 625.05   | 659.13   | 12.8 | 8    |
# | 29  | 625.05   | 300.16   | 32.8 | 20.5 |

insurance_data = {
    "pid": [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 26, 27, 28, 29
    ],
    "tiv_2015": [
        224.17, 224.17, 824.61, 424.32, 424.32, 625.05, 424.32, 624.46, 425.49, 624.46,
        624.46, 225.93, 824.61, 824.61, 826.37, 824.61, 824.61, 626.81, 224.76, 224.76,
        427.25, 224.76, 424.9, 227.69, 424.9, 424.9, 828.13, 625.05, 625.05
    ],
    "tiv_2016": [
        952.73, 900.66, 645.13, 323.66, 282.9, 243.53, 968.94, 714.13, 463.85, 776.85,
        692.71, 933, 786.86, 935.34, 516.1, 374.5, 924.19, 897.47, 714.79, 681.53,
        263.27, 671.8, 769.18, 830.5, 844.97, 733.35, 931.83, 659.13, 300.16
    ],
    "lat": [
        32.4, 52.4, 72.4, 12.4, 12.4, 52.5, 72.5, 12.5, 32.5, 12.4,
        72.5, 12.5, 32.6, 52.6, 12.4, 12.6, 32.6, 52.6, 72.6, 12.4,
        32.7, 52.7, 72.7, 12.7, 12.4, 52.7, 72.8, 12.8, 32.8
    ],
    "lon": [
        20.2, 32.7, 45.2, 7.7, 7.7, 32.8, 45.3, 7.8, 20.3, 7.7,
        45.3, 7.8, 20.3, 32.8, 7.7, 7.9, 20.4, 32.9, 45.4, 7.7,
        20.4, 32.9, 45.4, 7.9, 7.7, 32.9, 45.5, 8, 20.5
    ],
}
insurance_df_pd = pd.DataFrame(insurance_data)

insurance_df_pl = pl.DataFrame(insurance_data)
polars solution for Investments in 2016
(
    insurance_df_pl
    .with_columns(
        n = pl.count().over("tiv_2015"),
        nlocs = pl.count().over(['lat', 'lon'])
    )
    .filter(pl.col("n") > 1, pl.col("nlocs") == 1)
    .sum()
)
pandas solution for Investments in 2016
insurance_df_pd['ct1'] = insurance_df_pd.groupby('tiv_2015')['pid'].transform('count')
insurance_df_pd['ct2'] = insurance_df_pd.groupby(['lat', 'lon'])['pid'].transform('count')

round(insurance_df_pd[(insurance_df_pd['ct1'] > 1) & (insurance_df_pd['ct2'] == 1)]['tiv_2016'].sum(), 2)
sql solution for Investments in 2016
duckdb.sql(
    """
    with cte as (
        select pid, tiv_2016,
        count(*) over (partition by tiv_2015) as tiv_15_ct,
        count(*) over (partition by lat, lon) as loc_count
        from insurance_df_pd
    )

    select round(sum(tiv_2016::numeric), 2) as tiv_2016
    from cte
    where tiv_15_ct > 1 and loc_count = 1
    """
)
# Write a solution to find the people who have the most friends and the most friends number.
# https://leetcode.com/problems/friend-requests-ii-who-has-the-most-friends/description/

# +--------------+-------------+-------------+
# | requester_id | accepter_id | accept_date |
# +--------------+-------------+-------------+
# | 1            | 2           | 2016/06/03  |
# | 1            | 3           | 2016/06/08  |
# | 2            | 3           | 2016/06/08  |
# | 3            | 4           | 2016/06/09  |
# +--------------+-------------+-------------+

friend_requests_data = {
    "requester_id": [1, 1, 2, 3],
    "accepter_id": [2, 3, 3, 4],
    "accept_date": ["2016/06/03", "2016/06/08", "2016/06/08", "2016/06/09"],
}
friend_requests_df_pd = pd.DataFrame(friend_requests_data)

# Add user IDs up to 10
additional_users = pd.DataFrame({
    "requester_id": list(range(5, 11)),
    "accepter_id": list(range(6, 12)),
    "accept_date": ["2016/06/10"] * 3 + ["2016/06/11"] * 3,
})
fr_df_pd = pd.concat([friend_requests_df_pd, additional_users], ignore_index=True)

friend_requests_df_pl = pl.DataFrame(friend_requests_data)

# Add user IDs up to 10 in polars
additional_users_pl = pl.DataFrame({
    "requester_id": list(range(5, 11)),
    "accepter_id": list(range(6, 12)),
    "accept_date": ["2016/06/10"] * 3 + ["2016/06/11"] * 3,
})
fr_df_pl = pl.concat([friend_requests_df_pl, additional_users_pl])
pandas solution for Friend requests II - Who Has the Most Friends?
requester_counts = fr_df_pd.groupby("requester_id", as_index=False)['accepter_id'].count().rename({'accepter_id':'req_counts'}, axis=1)
accepter_counts = fr_df_pd.groupby("accepter_id", as_index=False)['requester_id'].count().rename({'requester_id':'req_counts'}, axis=1)

all_counts = requester_counts.merge(
    accepter_counts,
    how = 'outer',
    left_on = 'requester_id',
    right_on = 'accepter_id',
)

# fillna instead of chained indexing assignment (which doesn't work under copy-on-write)
all_counts['req_counts_x'] = all_counts['req_counts_x'].fillna(0)
all_counts['req_counts_y'] = all_counts['req_counts_y'].fillna(0)

all_counts['id'] = all_counts[['requester_id', 'accepter_id']].apply(lambda row: row['requester_id'] if not math.isnan(row['requester_id']) else row['accepter_id'], axis=1)
all_counts['all_counts'] = all_counts['req_counts_x'] + all_counts['req_counts_y']

sum_df = all_counts.groupby("id", as_index=False)['all_counts'].sum()

sum_df[sum_df['all_counts'] == max(sum_df['all_counts'])]
polars solution for Friend requests II - Who Has the Most Friends?
pl.DataFrame(pl.concat(
    [fr_df_pl['requester_id'],
    fr_df_pl['accepter_id']]
)).group_by("requester_id").len().filter(pl.col("len") == pl.col('len').max())
sql solution for Friend requests II - Who Has the Most Friends?
soln1 = """
    with accept_cts as
    (
        select accepter_id as id, count(accepter_id) as acc_ct
        from fr_df_pd
        group by accepter_id
    ),
    request_cts as 
    (
        select requester_id as id, count(requester_id) as req_ct
        from fr_df_pd
        group by requester_id
    )

    select coalesce(a.id, r.id) as id, (coalesce(acc_ct, 0) + coalesce(req_ct, 0)) as num from
    accept_cts a
    full join
    request_cts r
    on a.id = r.id
    order by (coalesce(acc_ct, 0) + coalesce(req_ct, 0)) desc
    limit 1
    """

soln2 = """
    WITH Combined AS
    (
        SELECT accepter_id, requester_id, accept_date FROM fr_df_pd
        UNION ALL
        (
            SELECT requester_id as accepter_id, accepter_id as requester_id, accept_date
            FROM fr_df_pd
        )
    ),
    FriendCounts AS
    (
        SELECT accepter_id as id, COUNT(*) as num
        FROM Combined
        GROUP BY accepter_id
    )

    SELECT * FROM FriendCounts 
    WHERE num = (SELECT MAX(num) FROM FriendCounts)
    """

duckdb.sql(
    soln2
)

duckdb.sql(
    soln1
)
# https://leetcode.com/problems/restaurant-growth/
# You are the restaurant owner and you want to analyze a possible expansion (there will be at least one customer every day).

# Compute the moving average of how much the customer paid in a seven days window (i.e., current day + 6 days before). average_amount should be rounded to two decimal places.

# Return the result table ordered by visited_on in ascending order.


# Create the Customer table in pandas
customer_data = {
    "customer_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 3],
    "name": ["Jhon", "Daniel", "Jade", "Khaled", "Winston", "Elvis", "Anna", "Maria", "Jaze", "Jhon", "Jade"],
    "visited_on": [
        "2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04", "2019-01-05",
        "2019-01-06", "2019-01-07", "2019-01-08", "2019-01-09", "2019-01-10", "2019-01-10"
    ],
    "amount": [100, 110, 120, 130, 110, 140, 150, 80, 110, 130, 150],
}
customer_df_pd = pd.DataFrame(customer_data)

# Create the Customer table in polars
customer_df_pl = pl.DataFrame(customer_data)
pandas solution for Restaurant Growth
customer_df_pd['visited_on'] = pd.to_datetime(customer_df_pd['visited_on'])
sum_df = customer_df_pd.groupby('visited_on', as_index=False)['amount'].sum()

sum_df['roll_avg'] = sum_df.rolling('7d', on = 'visited_on', min_periods=7)['amount'].mean()
sum_df['total_amount'] = sum_df.rolling('7d', on = 'visited_on', min_periods=7)['amount'].sum()

sum_df = sum_df[~sum_df['roll_avg'].isna()]
sum_df
polars solution for Restaurant Growth
(
    customer_df_pl
    .with_columns(pl.col('visited_on').str.to_datetime())
    .group_by("visited_on")
    .agg(
        daily_amount = pl.col("amount").sum()
    )
    .sort("visited_on")
    .rolling(index_column = 'visited_on', period='7d')
    .agg(
        pl.col("daily_amount").sum().alias("rolling_sum"),
        pl.col('daily_amount').mean().alias("rolling_mean"),
        pl.col('daily_amount').len().alias('len')
    )
    .filter(pl.col('len') == 7)
    .select(~cs.matches("len"))
)
sql solution for Restaurant Growth
duckdb.sql(
    """
    with dailysales as
    (
        select sum(amount) as daily_amount, visited_on, lag (visited_on, 6) over(order by visited_on) as day_lag
        from customer_df_pd
        group by visited_on
    ),
    rollingsales as (
        select 
            visited_on, 
            sum(daily_amount) over(order by visited_on rows between 6 preceding and current row) as amount,
            round(avg(daily_amount) over(order by visited_on rows between 6 preceding and current row), 2) as average_amount,
            day_lag
        from dailysales
    )

    select visited_on, amount, average_amount from rollingsales
    where day_lag is not null
    """
)
# https://leetcode.com/problems/rank-scores/description/
# Write a solution to find the rank of the scores. The ranking should be calculated according to the following rules:

# The scores should be ranked from the highest to the lowest.
# If there is a tie between two scores, both should have the same ranking.
# After a tie, the next ranking number should be the next consecutive integer value. In other words, there should be no holes between ranks.
# Return the result table ordered by score in descending order.

# +----+-------+
# | id | score |
# +----+-------+
# | 1  | 3.50  |
# | 2  | 3.65  |
# | 3  | 4.00  |
# | 4  | 3.85  |
# | 5  | 4.00  |
# | 6  | 3.65  |
# +----+-------+
# Generate additional data with random scores
additional_data = {
    "id": list(range(7, 21)),  # Adding IDs from 7 to 20
    "score": [round(random.uniform(3.0, 4.5), 2) for _ in range(7, 21)],
}

# Combine the original data with the additional data
score_data = {
    "id": [1, 2, 3, 4, 5, 6] + additional_data["id"],
    "score": [3.50, 3.65, 4.00, 3.85, 4.00, 3.65] + additional_data["score"],
}

score_df_pd = pd.DataFrame(score_data)

score_df_pl = pl.DataFrame(score_data)
pandas solution for Rank Scores
score_df_pd['rank'] = score_df_pd['score'].rank(ascending=False, method='dense')
score_df_pd.sort_values("score",ascending=False)
polars solution for Rank Scores
(
    score_df_pl
    .with_columns(
        rank = pl.col("score").rank("dense", descending=True)
    )
)
sql solution for Rank Scores
duckdb.sql(
    """
    select id, score, dense_rank() over(order by score desc) as rank
    from score_df_pl
    order by score desc
    """
)
# https://leetcode.com/problems/department-highest-salary/description/
# Write a solution to find employees who have the highest salary in each of the departments.

# +----+-------+--------+--------------+
# | id | name  | salary | departmentId |
# +----+-------+--------+--------------+
# | 1  | Joe   | 70000  | 1            |
# | 2  | Jim   | 90000  | 1            |
# | 3  | Henry | 80000  | 2            |
# | 4  | Sam   | 60000  | 2            |
# | 5  | Max   | 90000  | 1            |
# +----+-------+--------+--------------+
# Department table:
# +----+-------+
# | id | name  |
# +----+-------+
# | 1  | IT    |
# | 2  | Sales |
# +----+-------+

# Add more employees
additional_employees = {
    "id": [6, 7, 8, 9, 10],
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "salary": [75000, 85000, 95000, 72000, 88000],
    "departmentId": [2, 3, 3, 4, 4],
}

# Add more departments
additional_departments = {
    "id": [3, 4],
    "name": ["HR", "Marketing"],
}

# Combine the original and additional employee data
employee_data = {
    "id": [1, 2, 3, 4, 5] + additional_employees["id"],
    "name": ["Joe", "Jim", "Henry", "Sam", "Max"] + additional_employees["name"],
    "salary": [70000, 90000, 80000, 60000, 90000] + additional_employees["salary"],
    "departmentId": [1, 1, 2, 2, 1] + additional_employees["departmentId"],
}

# Combine the original and additional department data
department_data = {
    "id": [1, 2] + additional_departments["id"],
    "name": ["IT", "Sales"] + additional_departments["name"],
}

# Create the updated Employee table in pandas
employee_df_pd = pd.DataFrame(employee_data)

# Create the updated Department table in pandas
department_df_pd = pd.DataFrame(department_data)

# Create the updated Employee table in polars
employee_df_pl = pl.DataFrame(employee_data)

# Create the updated Department table in polars
department_df_pl = pl.DataFrame(department_data)
pandas solution for Department Highest Salary
out_df = employee_df_pd.merge(
    department_df_pd,
    left_on = 'departmentId',
    right_on = 'id'
)

out_df['rank'] = out_df.groupby("name_y")['salary'].rank(method='dense', ascending=False)
out_df.loc[out_df['rank'] == 1, ['name_y', 'name_x', 'salary']]  # department name, employee name, salary
polars solution for Department Highest Salary
(
    employee_df_pl.join(
        department_df_pl,
        left_on='departmentId',
        right_on='id'
    )
    .with_columns(
        rank = pl.col("salary").rank(method='dense', descending=True).over(pl.col("departmentId"))
    )
    .filter(pl.col("rank") == 1)
)
sql solution for Department Highest Salary
duckdb.sql(
"""
with cte as (
    select t1.name, t1.salary, t2.name as departmentname, dense_rank() over(partition by departmentId order by salary desc) as rank from
    employee_df_pl t1
    left join
    department_df_pl t2
    on t1.departmentId = t2.id
)

select * from cte
where rank = 1

"""
)

duckdb.sql(
    """
    with ranktable as (
    select id, name, salary, departmentId, dense_rank() over(partition by departmentId order by salary desc) as rank
    from employee_df_pd
    )

    select department_df_pd.name, max(salary) max_salary from
    ranktable
    left join 
    department_df_pd
    on ranktable.departmentId = department_df_pd.id
    where rank = 1
    group by department_df_pd.name
    """
)
# https://leetcode.com/problems/trips-and-users/description/

# The cancellation rate is computed by dividing the number of canceled (by client or driver) requests with unbanned users by the total number of requests with unbanned users on that day.

# Write a solution to find the cancellation rate of requests with unbanned users (both client and driver must not be banned) each day between "2013-10-01" and "2013-10-03" with at least one trip. Round Cancellation Rate to two decimal points.

trips_data = {
    "id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "client_id": [1, 2, 3, 4, 1, 2, 3, 2, 3, 4],
    "driver_id": [10, 11, 12, 13, 10, 11, 12, 12, 10, 13],
    "city_id": [1, 1, 6, 6, 1, 6, 6, 12, 12, 12],
    "status": [
        "completed", "cancelled_by_driver", "completed", "cancelled_by_client",
        "completed", "completed", "completed", "completed", "completed", "cancelled_by_driver"
    ],
    "request_at": [
        "2013-10-01", "2013-10-01", "2013-10-01", "2013-10-01",
        "2013-10-02", "2013-10-02", "2013-10-02", "2013-10-03",
        "2013-10-03", "2013-10-03"
    ],
}

users_data = {
    "users_id": [1, 2, 3, 4, 10, 11, 12, 13],
    "banned": ["No", "Yes", "No", "No", "No", "No", "No", "No"],
    "role": ["client", "client", "client", "client", "driver", "driver", "driver", "driver"],
}

trips_df_pd = pd.DataFrame(trips_data)
users_df_pd = pd.DataFrame(users_data)

trips_df_pl = pl.DataFrame(trips_data)
users_df_pl = pl.DataFrame(users_data)
pandas solution for Trips and Users
merge_df = trips_df_pd.merge(
    users_df_pd[['users_id', 'banned']],
    left_on = 'client_id',
    right_on = 'users_id'
).merge(
    users_df_pd[['users_id', 'banned']],
    left_on = 'driver_id',
    right_on = 'users_id'
)

# we only care about transactions where neither user was banned
merge_df = merge_df[(merge_df['banned_x'] == "No") & (merge_df['banned_y'] == "No")]

merge_df = merge_df.groupby('request_at', as_index=False).agg(
    cancelled = ('status', lambda x: len([el for el in x if el.startswith("cancelled")])),
    total = ('status', len)
)

merge_df['cancellation_rate'] = merge_df['cancelled']/merge_df['total']
merge_df
polars solution for Trips and Users
(
    trips_df_pl
    .join(
        users_df_pl,
        how = 'left',
        left_on='client_id',
        right_on='users_id',
        suffix='_client'
    )
    .rename({'banned':'client_banned'})
    .join(
        users_df_pl,
        how = 'left',
        left_on='driver_id',
        right_on='users_id'
    )
    .filter(pl.col("client_banned") == 'No', pl.col("banned") == "No")
    .group_by('request_at')
    .agg(
        cancelled_trips = pl.col("status").str.starts_with("cancelled").sum(),
        total_trips = pl.col("status").len()
    )
    .with_columns(
        cancel_rate = pl.col("cancelled_trips")/pl.col('total_trips')
    )
)
sql solution for Trips and Users
duckdb.sql(
    """
    with cte as (
        select status, request_at, u1.banned as banned_client, u2.banned as banned_driver
        from
        trips_df_pd t
        left join
        users_df_pd u1
        on t.client_id = u1.users_id
        left join
        users_df_pd u2
        on t.driver_id = u2.users_id
    )

    select request_at, round((count(*) filter (where status ilike 'cancelled%'))/(count(*)), 2) as cancellation_rate
    from cte
    where banned_client = 'No' and banned_driver = 'No'
    group by request_at   
    """
)
# https://leetcode.com/problems/game-play-analysis-iv/description/
# Write a solution to report the fraction of players that logged in again on the day after the day they first logged in, rounded to 2 decimal places. In other words, you need to determine the number of players who logged in on the day immediately following their initial login, and divide it by the number of total players.

activity_data = {
    "player_id": [1, 1, 2, 3, 3],
    "device_id": [2, 2, 3, 1, 4],
    "event_date": ["2016-03-01", "2016-03-02", "2017-06-25", "2016-03-02", "2018-07-03"],
    "games_played": [5, 6, 1, 0, 5],
}
activity_df_pd = pd.DataFrame(activity_data)
activity_df_pd['event_date'] = pd.to_datetime(activity_df_pd['event_date'])

activity_df_pl = pl.DataFrame(activity_data)
pandas solution for Game Play Analysis IV
activity_df_pd['mindate'] = activity_df_pd.groupby("player_id")['event_date'].transform(lambda x: min(x))

activity_df_pd['datediff'] = (activity_df_pd['event_date'] - activity_df_pd['mindate'])

activity_df_pd[activity_df_pd['datediff'].dt.days == 1].drop_duplicates("player_id").shape[0]/activity_df_pd.drop_duplicates("player_id").shape[0]
polars solution for Game Play Analysis IV
(
    activity_df_pl
    .with_columns(
        pl.col('event_date').str.to_datetime()
    )
    .with_columns(
        mindate = pl.col('event_date').min().over(pl.col("player_id"))
    )
    .with_columns(
        datediff = (pl.col("event_date") - pl.col("mindate")).dt.total_days()
    )
    .filter(pl.col('datediff') == 1)
    .unique("player_id")
    .shape[0]
)/activity_df_pl.unique('player_id').shape[0]
sql solution for Game Play Analysis IV
duckdb.sql(
    """
    with cte as (
        select player_id, event_date, min(event_date) over(partition by player_id) as mindate
        from activity_df_pd
    ),
    diffdf as (
        select player_id, date_part('day', (event_date - mindate)) as datediff
        from cte
    )

    select count(distinct player_id)/(select count(distinct player_id) from activity_df_pd)
    from diffdf
    where datediff == 1
    """
)
# https://leetcode.com/problems/tree-node/description/
# Each node in the tree can be one of three types:

# "Leaf": if the node is a leaf node.
# "Root": if the node is the root of the tree.
# "Inner": If the node is neither a leaf node nor a root node.
# Write a solution to report the type of each node in the tree.

tree_node_data = {
    "id": [1, 2, 3, 4, 5],
    "p_id": [None, 1, 1, 2, 2],
}
tree_node_df_pd = pd.DataFrame(tree_node_data)
tree_node_df_pl = pl.DataFrame(tree_node_data)
pandas solution for Tree Node
tree_node_df_pd['child_count'] = tree_node_df_pd['id'].apply(lambda x: sum(tree_node_df_pd['p_id'] == x))
tree_node_df_pd['has_parent'] = ~tree_node_df_pd['p_id'].isnull()

tree_node_df_pd['category'] = tree_node_df_pd['child_count'].case_when(
    [
        (tree_node_df_pd['has_parent'] == False, 'root'),
        (tree_node_df_pd['child_count'] > 0, 'inner'),
        (tree_node_df_pd['has_parent'] == True, 'leaf')
    ]
)
polars solution for Tree Node
(
    tree_node_df_pl
    .with_columns(
        has_parent = pl.col("p_id").is_null(),
        has_child = pl.col("id").map_elements(lambda x: x in tree_node_df_pl['p_id'])
    )
    .with_columns(
        category = pl.when(pl.col("p_id").is_null()).then(pl.lit("Root"))
        .when(pl.col('has_child')).then(pl.lit("Inner"))
        .otherwise(pl.lit("Leaf"))
    )
)
sql solution for Tree Node
duckdb.sql(
    """
    with cte as (
        select t1.id lid, t2.id rid, t1.p_id lpid, t2.p_id rpid
        from tree_node_df_pd t1
        left join
        tree_node_df_pd t2
        on t1.id = t2.p_id
    ),
    count_df as (
        select lid, count(lid) filter(where rid is not null) as num_children, count(lid) filter(where lpid is not null) num_parent
        from cte
        group by lid  
    )

    select lid as id, case
        when num_parent = 0 then 'root'
        when num_children > 0 then 'inner'
        else 'leaf'
        end as category
    from count_df
    """
)
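# https://leetcode.com/problems/swap-salary/description/
# Write a solution to swap all 'f' and 'm' values (i.e., change all 'f' values to 'm' and vice versa).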
# +----+------+-----+--------+
# | id | name | sex | salary |
# +----+------+-----+--------+
# | 1  | A    | m   | 2500   |
# | 2  | B    | f   | 1500   |
# | 3  | C    | m   | 5500   |
# | 4  | D    | f   | 500    |
# +----+------+-----+--------+

employee_data = {
    "id": [1, 2, 3, 4, 5, 6],
    "name": ["A", "B", "C", "D", "E", "F"],
    "sex": ["m", "f", "m", "f", "m", "f"],
    "salary": [2500, 1500, 5500, 500, 3000, 4000],
}
employee_df_pl = pl.DataFrame(employee_data)

# update solution, doesn't work with duckdb
# duckdb.sql(
#     """
#     update employee_df_pl
#     set sex = case
#         when sex = 'm' then 'f'
#         else 'm'
#         end
#     """
# )

duckdb.sql(
    """
    select 
        id, 
        name, 
        case when sex = 'm' then 'f' else 'm' end as sex, 
        salary
    from employee_df_pl
    """
)
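polars solution

A sketch of the same swap in polars, using when/then on the sex column.

(
    employee_df_pl
    .with_columns(
        pl.when(pl.col("sex") == "m")
        .then(pl.lit("f"))
        .otherwise(pl.lit("m"))
        .alias("sex")
    )
)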
# https://leetcode.com/problems/market-analysis-i/description/
# Write a solution to find for each user, the join date and the number of orders they made as a buyer in 2019.

users_data = {
    "user_id": [1, 2, 3, 4],
    "join_date": ["2018-01-01", "2018-02-09", "2018-01-19", "2018-05-21"],
    "favorite_brand": ["Lenovo", "Samsung", "LG", "HP"],
}
users_df_pd = pd.DataFrame(users_data)

orders_data = {
    "order_id": [1, 2, 3, 4, 5, 6],
    "order_date": ["2019-08-01", "2018-08-02", "2019-08-03", "2018-08-04", "2018-08-04", "2019-08-05"],
    "item_id": [4, 2, 3, 1, 1, 2],
    "buyer_id": [1, 1, 2, 4, 3, 2],
    "seller_id": [2, 3, 3, 2, 4, 4],
}
orders_df_pd = pd.DataFrame(orders_data)

users_df_pl = pl.DataFrame(users_data)

orders_df_pl = pl.DataFrame(orders_data)
pandas solution for Market Analysis I
join_df = users_df_pd.merge(
    orders_df_pd,
    left_on='user_id',
    right_on='buyer_id',
    how = 'left'
)

# need the year so we can get all 2019's
join_df['year'] = pd.to_datetime(join_df['order_date']).dt.year

join_df.groupby("user_id", as_index=False).agg(
    year_2019_count=('year', lambda x: sum(x == 2019)),
    join_date=('join_date', 'first')
)
polars solution for Market Analysis I
(
    users_df_pl
    .join(
        orders_df_pl,
        how = 'left',
        left_on='user_id',
        right_on='buyer_id'
    )
    .with_columns(
        join_date = pl.col('join_date').str.to_datetime(),
        order_date = pl.col('order_date').str.to_datetime()
    )
    .group_by("user_id")
    .agg(
        order_ct = pl.col("join_date").filter(pl.col("order_date").dt.year() == 2019).len(),
        join_date = pl.col("join_date").first()
    )
)
sql solution for Market Analysis I
duckdb.sql(
    """
    with cte as (
        select user_id, join_date::timestamp as join_date, order_date::timestamp as order_date
        from users_df_pl u
        left join 
        orders_df_pl o
        on u.user_id = o.buyer_id
    )

    select user_id, first(join_date) as join_date, count(order_date) filter (where date_part('year', order_date) = 2019) as order_ct
    from cte
    group by user_id
    """
)
# https://leetcode.com/problems/analyze-organization-hierarchy/description/

# Write a solution to analyze the organizational hierarchy and answer the following:

# Hierarchy Levels: For each employee, determine their level in the organization (CEO is level 1, employees reporting directly to the CEO are level 2, and so on).
# Team Size: For each employee who is a manager, count the total number of employees under them (direct and indirect reports).
# Salary Budget: For each manager, calculate the total salary budget they control (sum of salaries of all employees under them, including indirect reports, plus their own salary).
# Return the result table ordered by the result ordered by level in ascending order, then by budget in descending order, and finally by employee_name in ascending order.

# +-------------+---------------+------------+--------+-------------+
# | employee_id | employee_name | manager_id | salary | department  |
# +-------------+---------------+------------+--------+-------------+
# | 1           | Alice         | null       | 12000  | Executive   |
# | 2           | Bob           | 1          | 10000  | Sales       |
# | 3           | Charlie       | 1          | 10000  | Engineering |
# | 4           | David         | 2          | 7500   | Sales       |
# | 5           | Eva           | 2          | 7500   | Sales       |
# | 6           | Frank         | 3          | 9000   | Engineering |
# | 7           | Grace         | 3          | 8500   | Engineering |
# | 8           | Hank          | 4          | 6000   | Sales       |
# | 9           | Ivy           | 6          | 7000   | Engineering |
# | 10          | Judy          | 6          | 7000   | Engineering |
# +-------------+---------------+------------+--------+-------------+
organization_data = {
    "employee_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "employee_name": ["Alice", "Bob", "Charlie", "David", "Eva", "Frank", "Grace", "Hank", "Ivy", "Judy"],
    "manager_id": [None, 1, 1, 2, 2, 3, 3, 4, 6, 6],
    "salary": [12000, 10000, 10000, 7500, 7500, 9000, 8500, 6000, 7000, 7000],
    "department": ["Executive", "Sales", "Engineering", "Sales", "Sales", "Engineering", "Engineering", "Sales", "Engineering", "Engineering"],
}
organization_df_pd = pd.DataFrame(organization_data)

organization_df_pl = pl.DataFrame(organization_data)
pandas solution for Analyze Organization Hierarchy
level = 1

collect_df = organization_df_pd[organization_df_pd['manager_id'].isna()][['employee_id', 'manager_id', 'employee_name']]

collect_df['rank'] = level

cur_result = collect_df

while True:
    cur_result = cur_result.merge(
        organization_df_pd,
        left_on = 'employee_id',
        right_on = 'manager_id',
        how = 'inner',
        suffixes=('', '_right')
    )[['employee_id_right', 'employee_id', 'employee_name_right']].rename(columns = {
        'employee_id_right': 'employee_id',
        'employee_id': 'manager_id',
        'employee_name_right': 'employee_name'
    })

    if not cur_result.shape[0] > 0:
        break

    level += 1

    cur_result['rank'] = level

    collect_df = pd.concat([collect_df, cur_result])
polars solution for Analyze Organization Hierarchy
# same steps as the pandas solution: start from the top level (Alice, whose manager id is null), then repeatedly join to get the next level of the hierarchy.
# with cur_result containing the n-th level of the hierarchy, we perform a join to get the (n+1)-th level.

level = 1

collect_df = (
    organization_df_pl
    .filter(pl.col("manager_id").is_null())
    .select("employee_id", "manager_id", "employee_name")
    .with_columns(
        rank = pl.lit(level),
        team_size = pl.lit(0)
    )
)
cur_result = collect_df
cur_rows = 1

while cur_rows > 0:
    cur_result = cur_result.join(
        organization_df_pl,
        left_on = 'employee_id',
        right_on= 'manager_id',
        how = 'inner'
    ).select(
        pl.col("employee_id_right").alias("employee_id"),
        pl.col("employee_id").alias("manager_id"),
        pl.col("employee_name_right").alias("employee_name"),
        'rank',
        'team_size'
    )

    if not cur_result.shape[0] > 0:
        break

    collect_df = collect_df.with_columns(
        team_size = pl.when(pl.col('rank') == level).then(pl.lit(cur_result.shape[0])).otherwise(pl.col('team_size'))
    )

    level += 1

    cur_result = cur_result.with_columns(
        rank = pl.lit(level)
    )

    collect_df = pl.concat([collect_df, cur_result])
sql solution for Analyze Organization Hierarchy
duckdb.sql(
    """
    with recursive rank_df as (
        select employee_id, employee_name, manager_id, salary, 1 as rank
        from organization_df_pd
        where manager_id is null
        union all
        select o.employee_id, o.employee_name, o.manager_id, o.salary, c.rank + 1
        from rank_df c
        inner join organization_df_pd o
        on c.employee_id = o.manager_id
    ),
    -- Recursive CTE to get all subordinates for each manager
    subordinates as (
        select employee_id as manager_id, employee_id as subordinate_id, salary as subordinate_salary
        from organization_df_pd
        union all
        select s.manager_id, o.employee_id as subordinate_id, o.salary as subordinate_salary
        from subordinates s
        join organization_df_pd o
        on s.subordinate_id = o.manager_id
    ),
    -- Calculate team size and budget for each manager
    cumulative_df as (
        select 
            r.employee_id, 
            r.employee_name,
            r.manager_id, 
            r.salary, 
            r.rank,
            count(s.subordinate_id) - 1 as team_size, -- exclude self
            sum(s.subordinate_salary) as budget
        from rank_df r
        left join subordinates s
        on r.employee_id = s.manager_id
        group by r.employee_id, r.employee_name, r.manager_id, r.salary, r.rank
    )

    select employee_id, employee_name, rank as level, team_size, budget
    from cumulative_df
    order by level, budget desc, employee_name
    """
)