Modelling RfQs in Dealer to Client Markets#

Generative models for the request for quote activity#

Simulation of RfQs arrival and client attrition#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import poisson, norm
from collections import deque

def run_simulation(N_clients=50,
                   T=200,
                   reservation_price_mean=100,
                   reservation_price_std_pct=0.1,
                   lambda_mean=1,
                   lambda_std=0.05,
                   prior_mean=60,
                   prior_std=10,
                   res_price_noise_std_pct=0.2,
                   hit_rate_target=0.4,
                   window_size=10,
                   attrition_threshold=0.1):

    """
    Simulate RFQ interactions with Bayesian quantile pricing and client attrition.
    Clients stop trading if their moving-average hit rate over `window_size` days falls below `attrition_threshold`.

    Returns:
        rfq_df: DataFrame of RfQ events
        active_history: list of active client counts per day
        activity_matrix: binary DataFrame of daily activity (clients x days)
    """
    np.random.seed(42)
    # Generate distribution of reservation prices for the segment of clients
    reservation_prices = np.random.normal(reservation_price_mean,
                                  reservation_price_mean * reservation_price_std_pct,
                                  N_clients)
    # Reservation prices are noisy given potential changing market conditions
    res_price_noise_std = res_price_noise_std_pct * reservation_price_mean
    res_price_noise_var = res_price_noise_std**2
    # Generate distribution of RfQ intensities for the segment of clients
    lambdas = np.abs(np.random.normal(lambda_mean, lambda_std, N_clients))
    clients_posterior = {i: {'mean': prior_mean, 'var': prior_std**2,
                             'n': 0, 'sum_obs': 0.0}
                         for i in range(N_clients)}
    z = norm.ppf(1 - hit_rate_target)
    active = np.ones(N_clients, dtype=bool)
    # Track per-client active status at end of each day
    active_flags = np.zeros((T, N_clients), dtype=bool)
    daily_history = [deque(maxlen=window_size) for _ in range(N_clients)]
    active_history = []
    records = []
    Y = np.zeros((T, N_clients), dtype=int)
    for t in range(T):
        daily_hits = np.zeros(N_clients, dtype=int)
        daily_reqs = np.zeros(N_clients, dtype=int)
        for i in range(N_clients):
            if not active[i]: continue
            n_rfq = poisson.rvs(lambdas[i])
            if n_rfq > 0: Y[t,i] = 1
            for _ in range(n_rfq):
                post = clients_posterior[i]
                price = max(0.0, post['mean'] + np.sqrt(post['var'] + res_price_noise_var) * z)
                r = norm.rvs(reservation_prices[i], res_price_noise_std)
                # Trading happens when price offered is lower than the reservation price
                hit = price <= r
                daily_reqs[i] += 1; daily_hits[i] += int(hit)
                # The dealer updates the estimation of the reservation price of the client
                post['n'] += 1; post['sum_obs'] += r
                post_var = 1/(1/prior_std**2 + post['n']/res_price_noise_var)
                post['var'] = post_var
                post['mean'] = post_var*(prior_mean/prior_std**2 + post['sum_obs']/res_price_noise_var)
                records.append({'time': t, 'client_id': i, 'price': price, 'hit': hit})
        for i in range(N_clients):
            if not active[i] or daily_reqs[i]==0: continue
            rate = daily_hits[i]/daily_reqs[i]
            daily_history[i].append(rate)
            # A client stops sending RfQs to the dealer if the hit & miss is too low
            if len(daily_history[i])==window_size and np.mean(daily_history[i])<attrition_threshold:
                active[i] = False
        active_history.append(active.sum())
        active_flags[t, :] = active.copy()
    rfq_df = pd.DataFrame(records)
    activity = pd.DataFrame(Y, columns=[f'client_{i}' for i in range(N_clients)])
    active_df = pd.DataFrame(active_flags, columns=[f'client_{i}' for i in range(N_clients)])
    return rfq_df, active_history, activity, active_df
from scipy.special import betaln
from scipy.optimize import minimize

# ----------------------
# Segment-level Model
# ----------------------
def estimate_segment_params(activity_matrix):
    def log_marginal_likelihood(D, alpha, beta, gamma, delta):
        n, x = len(D), D.sum()
        last = np.where(D==1)[0]
        r = n - (last[-1]+1) if last.size>0 else n
        logA = (betaln(alpha+x, beta+n-x)-betaln(alpha,beta)
                +betaln(gamma, delta+n)-betaln(gamma,delta))
        logB = [(betaln(alpha+x, beta+n-x-i)-betaln(alpha,beta)
                 +betaln(gamma+1, delta+n-i)-betaln(gamma,delta))
                for i in range(1, r+1)]
        mags = [logA] + logB; m_max = max(mags)
        return m_max + np.log(sum(np.exp(m - m_max) for m in mags))
    def neg_ll(params):
        alpha, beta, gamma, delta = np.exp(params)
        return -sum(log_marginal_likelihood(activity_matrix.iloc[:end, j].values,
                                             alpha, beta, gamma, delta)
                    for j in range(activity_matrix.shape[1])
                    for end in [activity_matrix.iloc[:,:].values.shape[0]])
    res = minimize(neg_ll, np.log([1,1,1,1]), method='L-BFGS-B', bounds=[(-5,5)]*4)
    return np.exp(res.x)

def attrition_probability(D, alpha, beta, gamma, delta):
    n, x = len(D), D.sum()
    last = np.where(D==1)[0]
    r = n - (last[-1]+1) if last.size>0 else n
    logA = (betaln(alpha+x, beta+n-x)-betaln(alpha,beta)
            +betaln(gamma, delta+n)-betaln(gamma,delta))
    logB = [(betaln(alpha+x, beta+n-x-i)-betaln(alpha,beta)
             +betaln(gamma+1, delta+n-i)-betaln(gamma,delta))
            for i in range(1, r+1)]
    mags = [logA] + logB; m_max = max(mags)
    logL = m_max + np.log(sum(np.exp(m - m_max) for m in mags))
    P_active = np.exp(logA - logL)
    return 1 - P_active
from sklearn.metrics import (
    confusion_matrix, accuracy_score,
    precision_score, recall_score,
    roc_curve, auc
)

# ----------------------
# Workflow: Simulate, Train, Test
# ----------------------
T_train, T_test = 100, 100
T_tot = T_train + T_test
N_clients = 50
rfq_all, active_all, Y_all, active_df = run_simulation(N_clients = N_clients, T=T_train+T_test)

Y_train = Y_all.iloc[:T_train].reset_index(drop=True)
Y_test  = Y_all.iloc[T_train:].reset_index(drop=True)
rfq_test = rfq_all[rfq_all['time']>=T_train].copy()
rfq_test['time'] -= T_train
active_test = active_all[T_train:]

alpha, beta, gamma, delta = estimate_segment_params(Y_train)
print(f"Params: α={alpha:.2f}, β={beta:.2f}, γ={gamma:.2f}, δ={delta:.2f}")

# ----------------------
# Risk Scoring
# ----------------------
risk_records = []
for j in range(Y_test.shape[1]):
    hist = []
    for t in range(T_test):
        hist.append(Y_test.iloc[t, j])
        D = np.array(hist, dtype=int)
        p_inact = attrition_probability(D, alpha, beta, gamma, delta)
        risk_records.append({
            'time': t,
            'client_id': j,
            'p_inactive': p_inact,
            'alert': p_inact > 0.5  # thresholded prediction
        })

risk_df = pd.DataFrame(risk_records)

# Label inactivity
risk_df['inactive'] = risk_df.apply(
    lambda r: not active_df.loc[r['time'] + T_train, f'client_{r["client_id"]}'],
    axis=1
)

# ----------------------
# Evaluation Metrics
# ----------------------

# Ground truth and predictions
y_true   = risk_df['inactive']
y_pred   = risk_df['alert']           # at 0.5 threshold
y_scores = risk_df['p_inactive']      # raw probabilities

# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("Confusion Matrix:")
print(f"  TN={tn}, FP={fp}, FN={fn}, TP={tp}")

# Accuracy, Precision, Recall at 0.5
accuracy  = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall    = recall_score(y_true, y_pred)
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")

# ROC curve & AUC
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)
print(f"AUC      : {roc_auc:.4f}")

# Plot ROC
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()
Params: α=148.41, β=85.31, γ=0.08, δ=148.41
Confusion Matrix:
  TN=4365, FP=0, FN=76, TP=559
Accuracy : 0.9848
Precision: 1.0000
Recall   : 0.8803
AUC      : 0.9884
../_images/dfc582116691cbf2ce91f9502cd45738ddefc5d3ba39583012ec1bfba3d3df2c.png
# ----------------------
# Plots: Hit Rate & Attrition Over Full Simulation
# ----------------------
# Active-client hit rate over time
hit_rate_active = []
days = np.arange(T_tot)
for t in days:
    act_clients = active_df.iloc[t]
    active_ids = [int(col.split('_')[1]) for col, flag in act_clients.items() if flag]
    daily = rfq_all[rfq_all['time'] == t]
    rates = []
    for cid in active_ids:
        cr = daily[daily['client_id'] == cid]
        if not cr.empty:
            rates.append(cr['hit'].mean())
    hit_rate_active.append(np.mean(rates) if rates else np.nan)

plt.figure(figsize=(10,5))
plt.plot(days, hit_rate_active, label='Avg Hit Rate', color='blue')
plt.title('Average Hit Rate Over Time')
plt.xlabel('Day'); plt.ylabel('Hit Rate'); plt.grid(True); plt.legend()

# Attrition rate over time
attr_rate = [100 * (Y_all.shape[1] - x) / Y_all.shape[1] for x in active_all]
plt.figure(figsize=(10,5))
plt.plot(days, attr_rate, label='% Inactive', color='red')
plt.title('Cumulative Attrition Rate Over Time')
plt.xlabel('Day'); plt.ylabel('% Inactive'); plt.grid(True); plt.legend()
<matplotlib.legend.Legend at 0x31037edf0>
../_images/05c9c7aa5098a6463bc01f4c28ef067b5f8537e190f312f9cd20d5013c2d652f.png ../_images/00df5c45b747c5781d672ac7b8a47af6ff4ddac8212d7f225090f815d186c214.png
# ----------------------
# Model Fit Visualization
# ----------------------
# Compare predicted attrition vs actual on test
pred_rate = risk_df.groupby('time')['p_inactive'].mean()
actual_rate = [100*(N_clients-x)/N_clients for x in active_test]
plt.figure(figsize=(10,5))
plt.plot(pred_rate.index, 100*pred_rate.values, label='Predicted % Inactive')
plt.plot(actual_rate, label='Actual % Inactive')
plt.title('Predicted vs Actual Attrition (Test Set)')
plt.xlabel('Day'); plt.ylabel('% Inactive'); plt.legend(); plt.grid(True)
../_images/d137c85f3ef48d476e5fa07603a94a61f77574ad85f0cebc66f61ff3b4fcb38b.png
import numpy as np
import matplotlib.pyplot as plt

# ----------------------
# Function: Plot Clients by ID (single figure with subplots)
# ----------------------
def plot_clients_subplots(client_ids):
    """
    For each client in client_ids, create a subplot in one figure:
      - Left y-axis: active (blue=active, red=inactive) and trading days (green), with numeric ticks (0/1)
      - Right y-axis: attrition probability over time
      - Separate legends: left for 'Active', 'Inactive', 'Traded'; right for 'P(Inactive)'
      - Title of each subplot shows the client ID
    """
    n = len(client_ids)
    fig, axes = plt.subplots(nrows=n, ncols=1, figsize=(12, 4*n), sharex=True)
    if n == 1:
        axes = [axes]
        
    days = np.arange(T_test)

    for ax1, cid in zip(axes, client_ids):
        # gather P(inactive) values
        p_vals = [
            risk_df.loc[
                (risk_df['client_id'] == cid) & (risk_df['time'] == t),
                'p_inactive'
            ].item()
            for t in days
        ]
        # latent active flags and trading days
        active_flag = active_df[T_train:][f'client_{cid}'].values
        trade_days  = Y_test.iloc[:, cid]

        # Left axis: Active vs Inactive
        idx_active   = days[active_flag]
        idx_inactive = days[~active_flag]
        h_active   = ax1.scatter(idx_active,   [1]*len(idx_active),   c='blue', marker='s', alpha=0.3, label='Active')
        h_inactive = ax1.scatter(idx_inactive, [1]*len(idx_inactive), c='red',  marker='s', alpha=0.3, label='Inactive')
        # Left axis: actual trades (0 or 1)
        h_trade = ax1.scatter(trade_days.index, trade_days.values,
                              c='green', marker='o', label='Traded')

        # Set numeric ticks on left y-axis
        ax1.set_ylim(-0.2, 1.2)
        ax1.set_yticks([0, 1])
        ax1.set_ylabel('Activity (0/1)')
        ax1.set_title(f'Client {cid} Activity and Attrition Prob')

        # Left-axis legend
        ax1.legend(handles=[h_active, h_inactive, h_trade],
                   labels=['Active', 'Inactive', 'Traded'],
                   loc='upper left')

        # Right axis: P(Inactive)
        ax2 = ax1.twinx()
        h_prob, = ax2.plot(days, p_vals, c='black', label='P(Inactive)')
        ax2.set_ylim(0, 1)
        ax2.set_ylabel('P(Inactive)')

        # Right-axis legend
        ax2.legend(handles=[h_prob], loc='upper right')

    axes[-1].set_xlabel('Day')
    plt.tight_layout()
    plt.show()

# Example: plot select clients in one figure
#plot_clients_subplots([3, 8, 9, 18])
plot_clients_subplots([3, 8, 9, 30])
../_images/41c869da6de3e027316759a5512b58c54fb18b4a028df91fdd8bc7e01ebe4872.png

Simulation of client abnormal behavior#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import poisson, norm
from collections import deque

def run_simulation(N_clients=50,
                   T=200,
                   reservation_price_mean=100,
                   reservation_price_std_pct=0.1,
                   lambda_mean=1,
                   lambda_std=0.05,
                   prior_mean=60,
                   prior_std=10,
                   reservation_price_noise_std=None,
                   discount_trigger_pct=0.35,  # trigger threshold: 0.5 means 50% lower
                   boosted_rate_factor = 10,
                   hit_rate_target=0.4,
                   window_size=10,
                   attrition_threshold=0.1,
                   random_seed=42):

    """
    Simulate RFQ interactions with Bayesian quantile pricing and client attrition.
    Adds a mechanism: if a client gets a quote at least `discount_trigger_pct` below their reservation price,
    they double their RFQ rate until the dealer quotes above their reservation price.
    """

    np.random.seed(random_seed)

    # Generate distribution of reservation prices
    reservation_prices = np.random.normal(reservation_price_mean,
                                          reservation_price_mean * reservation_price_std_pct,
                                          N_clients)

    # Noise in reservation prices
    fair_price = reservation_price_mean
    res_price_noise_std = reservation_price_noise_std or (fair_price * reservation_price_std_pct * 2)
    res_price_noise_var = res_price_noise_std ** 2

    # Base RfQ intensities
    lambdas = np.abs(np.random.normal(lambda_mean, lambda_std, N_clients))
    base_lambdas = lambdas.copy()  # store for reset later

    # Posterior beliefs
    clients_posterior = {i: {'mean': prior_mean, 'var': prior_std ** 2,
                             'n': 0, 'sum_obs': 0.0}
                         for i in range(N_clients)}

    z = norm.ppf(1 - hit_rate_target)
    active = np.ones(N_clients, dtype=bool)

    # Flags for tracking "boosted" mode
    boosted = np.zeros(N_clients, dtype=bool)

    # Track daily active/boosted status
    active_flags = np.zeros((T, N_clients), dtype=bool)
    boosted_flags = np.zeros((T, N_clients), dtype=bool)

    daily_history = [deque(maxlen=window_size) for _ in range(N_clients)]
    active_history = []
    records = []
    Y = np.zeros((T, N_clients), dtype=int)  # binary: whether client sent at least one RFQ that day

    for t in range(T):
        daily_hits = np.zeros(N_clients, dtype=int)
        daily_reqs = np.zeros(N_clients, dtype=int)

        for i in range(N_clients):
            if not active[i]:
                continue

            n_rfq = poisson.rvs(lambdas[i])
            if n_rfq > 0:
                Y[t, i] = 1

            for _ in range(n_rfq):
                post = clients_posterior[i]
                price = max(0.0, post['mean'] + np.sqrt(post['var'] + res_price_noise_var) * z)
                r = norm.rvs(reservation_prices[i], res_price_noise_std)

                # Trading happens when price <= reservation price
                hit = price <= r
                daily_reqs[i] += 1
                daily_hits[i] += int(hit)

                # Check discount trigger
                if price <= (1 - discount_trigger_pct) * r:
                    boosted[i] = True
                    lambdas[i] = base_lambdas[i] * boosted_rate_factor
                elif boosted[i] and price > r / (1-discount_trigger_pct):
                    boosted[i] = False
                    lambdas[i] = base_lambdas[i]

                # Update posterior belief
                post['n'] += 1
                post['sum_obs'] += r
                post_var = 1 / (1 / (prior_std ** 2) + post['n'] / res_price_noise_var)
                post['var'] = post_var
                post['mean'] = post_var * (prior_mean / (prior_std ** 2) +
                                           post['sum_obs'] / res_price_noise_var)

                records.append({'time': t,
                                'client_id': i,
                                'price': price,
                                'reservation': r,
                                'hit': hit,
                                'boosted': boosted[i]})

        # Update attrition status
        for i in range(N_clients):
            if not active[i] or daily_reqs[i] == 0:
                continue
            rate = daily_hits[i] / daily_reqs[i]
            daily_history[i].append(rate)
            if len(daily_history[i]) == window_size and np.mean(daily_history[i]) < attrition_threshold:
                active[i] = False

        active_history.append(int(active.sum()))
        active_flags[t, :] = active.copy()
        boosted_flags[t, :] = boosted.copy()

    rfq_df = pd.DataFrame(records)
    activity_df = pd.DataFrame(Y, columns=[f'client_{i}' for i in range(N_clients)])
    active_df = pd.DataFrame(active_flags, columns=[f'client_{i}' for i in range(N_clients)])
    boosted_df = pd.DataFrame(boosted_flags, columns=[f'client_{i}' for i in range(N_clients)])

    return rfq_df, active_history, activity_df, active_df, boosted_df
# Run simulation
rfq_df, active_history, activity_df, active_df, boosted_df = run_simulation()

# Plot active and boosted clients over time
plt.figure(figsize=(10,5))
plt.plot(active_df.sum(axis=1), label='Active Clients')
plt.plot(boosted_df.sum(axis=1), label='Boosted Clients', linestyle='--')
plt.xlabel('Day')
plt.ylabel('Number of Clients')
plt.title('Active vs Boosted Clients Over Time')
plt.legend()
plt.grid(True)
plt.show()

# Heatmap of boosted status
plt.figure(figsize=(12,6))
plt.imshow(boosted_df.T, aspect='auto', cmap='Reds')
plt.colorbar(label='Boosted Status (1=Yes, 0=No)')
plt.xlabel('Day')
plt.ylabel('Client ID')
plt.title('Boosted Clients Over Time')
plt.show()
../_images/381aa8f918612efd8b28f38396f8cc9bc4399ba0f8d71408c2233a3670fe8c97.png ../_images/3c7b2e4c2b6790c446f3f9b947b57a544ece8c4e162b10ca87a1a62027975ea1.png
from scipy.special import betaln, gammaln
from scipy.optimize import minimize

# -----------------------------
#  Beta-Binomial mixture MLE with shared mean
# -----------------------------

def log_beta_binom_pmf(x, n, alpha, beta):
    # log [ C(n,x) * B(alpha+x, beta+n-x) / B(alpha,beta) ]
    return (
        gammaln(n + 1) - gammaln(x + 1) - gammaln(n - x + 1)
        + betaln(alpha + x, beta + n - x)
        - betaln(alpha, beta)
    )

def neg_log_likelihood(params, xs, ns):
    # params = [logit_mu, log_k_g, log_k_b, logit_qg]
    logit_mu, log_k_g, log_k_b, logit_qg = params
    
    mu = 1 / (1 + np.exp(-logit_mu))  # shared mean
    k_g = np.exp(log_k_g) + 1e-8
    k_b = np.exp(log_k_b) + 1e-8
    qg  = 1 / (1 + np.exp(-logit_qg))

    # convert to alpha, beta
    alpha_g, beta_g = mu * k_g, (1 - mu) * k_g
    alpha_b, beta_b = mu * k_b, (1 - mu) * k_b

    # mixture likelihood per client
    ll = []
    for x, n in zip(xs, ns):
        log_p_g = log_beta_binom_pmf(x, n, alpha_g, beta_g)
        log_p_b = log_beta_binom_pmf(x, n, alpha_b, beta_b)
        # log-sum-exp for mixture
        a = np.log(qg) + log_p_g
        b = np.log(1 - qg) + log_p_b
        m = max(a, b)
        ll_i = m + np.log(np.exp(a - m) + np.exp(b - m))
        ll.append(ll_i)
    return -np.sum(ll)

def fit_beta_mixture_mle(activity_df, train_days):
    """
    activity_df: (T x N) binary DataFrame
    train_days: number of days used for training (first half)
    Returns dict with fitted parameters.
    """
    T, N = activity_df.shape
    A = activity_df.iloc[:train_days].values  # (train_days x N)
    xs = A.sum(axis=0)  # successes per client in training
    ns = np.full_like(xs, fill_value=train_days)

    # rough start for mean
    p_hat = xs.mean() / train_days
    logit_mu0 = np.log(p_hat / (1 - p_hat + 1e-8))

    start = np.array([
        logit_mu0,
        np.log(10.0),   # k_g initial concentration
        np.log(2.0),    # k_b initial concentration
        0.0             # logit(0.5)
    ], dtype=float)

    res = minimize(neg_log_likelihood, start, args=(xs, ns), method='L-BFGS-B')
    logit_mu, log_k_g, log_k_b, logit_qg = res.x
    
    mu  = 1 / (1 + np.exp(-logit_mu))
    k_g = np.exp(log_k_g) + 1e-8
    k_b = np.exp(log_k_b) + 1e-8
    
    params = {
        'mu':     float(mu),
        'alpha_g': float(mu * k_g), 'beta_g': float((1 - mu) * k_g),
        'alpha_b': float(mu * k_b), 'beta_b': float((1 - mu) * k_b),
        'q_g':    float(1 / (1 + np.exp(-logit_qg))),
        'success': bool(res.success),
        'message': res.message
    }
    return params

def posterior_good(x, n, params):
    a_g = params['alpha_g']; b_g = params['beta_g']
    a_b = params['alpha_b']; b_b = params['beta_b']
    q_g = params['q_g']

    log_p_g = log_beta_binom_pmf(x, n, a_g, b_g)
    log_p_b = log_beta_binom_pmf(x, n, a_b, b_b)

    a = np.log(q_g) + log_p_g
    b = np.log(1 - q_g) + log_p_b
    m = max(a, b)
    num = np.exp(a - m)
    den = num + np.exp(b - m)
    return float(num / den)
# Fit on first half, removing first 25 days until it settles
T = activity_df.shape[0]
mid = T // 2
params = fit_beta_mixture_mle(activity_df[25:], train_days=mid)

# Detect abnormalities on second half using a sliding window
n_window = 10  # window size for detection, can be adjusted
threshold = 0.5  # classify as abnormal if p(good|D) < threshold

N = activity_df.shape[1]
abnormal_flags = np.zeros_like(activity_df.values, dtype=bool)

# For each day t in the second half, use the last n_window days (bounded within the second half)
for t in range(mid, T):
    start = max(mid, t - n_window + 1)
    end = t + 1
    window = activity_df.iloc[start:end].values  # (w x N)
    n = window.shape[0]
    x = window.sum(axis=0)
    # Posterior for each client
    p_good = np.array([posterior_good(int(xi), int(n), params) for xi in x])
    abnormal_flags[t, :] = p_good < threshold

abnormal_df = pd.DataFrame(abnormal_flags, columns=activity_df.columns, index=activity_df.index)
params
{'mu': 0.6459024041525655,
 'alpha_g': 17819.14219878137,
 'beta_g': 9768.837168102045,
 'alpha_b': 0.6772910514791353,
 'beta_b': 0.3713055277018204,
 'q_g': 0.892871535754001,
 'success': True,
 'message': 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'}
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta

def plot_fitted_betas(params, bins=200):
    """
    Plot fitted good vs bad Beta distributions.
    params: dict returned by fit_beta_mixture_mle
    """
    a_g, b_g = params['alpha_g'], params['beta_g']
    a_b, b_b = params['alpha_b'], params['beta_b']
    q_g = params['q_g']

    # Grid
    x = np.linspace(0, 1, bins)

    # Densities
    pdf_g = beta.pdf(x, a_g, b_g)
    pdf_b = beta.pdf(x, a_b, b_b)

    plt.figure(figsize=(10, 6))
    plt.plot(x, pdf_g, label=f"Good (α={a_g:.2f}, β={b_g:.2f})", lw=2, color="blue")
    plt.plot(x, pdf_b, label=f"Bad  (α={a_b:.2f}, β={b_b:.2f})", lw=2, color="red")
    plt.fill_between(x, pdf_g, alpha=0.2, color="blue")
    plt.fill_between(x, pdf_b, alpha=0.2, color="red")

    plt.title("Fitted Beta Distributions (Good vs Bad)")
    plt.xlabel("Success probability")
    plt.ylabel("Density")
    plt.legend()
    plt.grid(True, ls="--", alpha=0.5)
    plt.show()


plot_fitted_betas(params)
../_images/1cb1662963f25a6e399f4616a42b9f5d11d5f730541d1e668d2167c2d9147989.png
# Use a previous window (exclude day t) when computing P(good|D)
n_window = 10
threshold = 0.5

def build_pgood_panel(activity_df, boosted_df, active_df, params, mid, n_window, threshold):
    T, N = activity_df.shape
    rows = []
    # Start at mid + n_window to ensure a full *previous* window exists
    for t in range(mid + n_window, T):
        start = t - n_window
        end = t  # exclude day t
        window = activity_df.iloc[start:end]  # (n_window x N)
        x = window.sum(axis=0).values.astype(int)  # successes per client in previous window
        n = n_window
        # compute p_good for each client
        p_goods = []
        for j in range(N):
            p = posterior_good(int(x[j]), int(n), params)
            p_goods.append(p)
        p_goods = np.array(p_goods)
        # flags at time t
        boosted_t = boosted_df.iloc[t].values.astype(bool)
        active_t  = active_df.iloc[t].values.astype(bool)
        inactive_t = ~active_t
        traded_today = activity_df.iloc[t].values.astype(int)
        for j in range(N):
            rows.append({
                'day': t,
                'client_id': j,
                'x': int(x[j]),
                'n': int(n),
                'p_good': float(p_goods[j]),
                'abnormal': bool(p_goods[j] < threshold),
                'boosted': bool(boosted_t[j]),
                'inactive': bool(inactive_t[j]),
                'traded_today': int(traded_today[j])
            })
    panel = pd.DataFrame(rows)
    return panel

panel_df = build_pgood_panel(activity_df, boosted_df, active_df, params, mid, n_window, threshold)

# Correctly aligned daily summary for second half *with windowing*
days = sorted(panel_df['day'].unique())
daily_summary = panel_df.groupby('day').agg(
    abnormal_count=('abnormal', 'sum'),
    boosted_count=('boosted', 'sum'),
    inactive_count=('inactive', 'sum')
).reset_index()

# Overlap summary on client-day panel
overlap_summary = {
    'days_evaluated': int(len(days)),
    'clients': int(N),
    'total_client_days': int(panel_df.shape[0]),
    'abnormal_client_days': int(panel_df['abnormal'].sum()),
    'boosted_client_days': int(panel_df['boosted'].sum()),
    'inactive_client_days': int(panel_df['inactive'].sum()),
    'abnormal_and_boosted': int((panel_df['abnormal'] & panel_df['boosted']).sum()),
    'abnormal_and_inactive': int((panel_df['abnormal'] & panel_df['inactive']).sum()),
    'abnormal_not_boosted_or_inactive': int((panel_df['abnormal'] & ~panel_df['boosted'] & ~panel_df['inactive']).sum())
}
overlap_df2 = pd.DataFrame([overlap_summary])

# Heatmap of p_good by client (days x clients) for visual inspection
# Build a matrix with rows=days, cols=clients
days_idx = daily_summary['day'].values
pgood_mat = np.full((len(days_idx), N), np.nan)
for i, t in enumerate(days_idx):
    slice_t = panel_df[panel_df['day'] == t].sort_values('client_id')
    pgood_mat[i, :] = slice_t['p_good'].values

plt.figure(figsize=(12,6))
plt.imshow(pgood_mat.T, aspect='auto')
plt.colorbar(label='P(good | previous window)')
plt.xlabel('Day index in second half')
plt.ylabel('Client ID')
plt.title('Per-client P(good|D) over time (previous window)')
plt.show()

   
../_images/a52691a97a0a6cc384724d0db12951fb986bad468d049feff88c4ed5de4a1f17.png
from matplotlib.lines import Line2D

def plot_clients_activity_prob(clients, df):
    """
    One figure with one row per client.
    On boosted days, only the 'Boosted' marker is shown (others hidden).
    """
    clients = list(clients)
    n = len(clients)
    if n == 0:
        return
    
    fig, axes = plt.subplots(n, 1, figsize=(12, 2.8 * n), sharex=True, constrained_layout=True)
    if n == 1:
        axes = [axes]
    
    # Build a single, global legend
    legend_elems = [
        Line2D([0],[0], marker='s', linestyle='None', color='blue',  alpha=0.3, label='Active'),
        Line2D([0],[0], marker='x', linestyle='None', color='gray',  alpha=0.3, label='Inactive'),
        Line2D([0],[0], marker='o', linestyle='None', color='red',   alpha=0.5, label='Traded'),
        Line2D([0],[0], marker='^', linestyle='None', color='green', alpha=0.7, label='Boosted'),
        Line2D([0],[0], linestyle='-', color='black', label='P(good|D)'),
    ]
    
    x_min, x_max = None, None
    for ax, cid in zip(axes, clients):
        df_c = df[df['client_id'] == cid].sort_values('day').copy()
        if df_c.empty:
            ax.set_title(f"Client {cid} (no data)")
            ax.set_yticks([])
            continue
        
        # Masks
        boosted_mask  = df_c['boosted']
        active_mask   = (~df_c['inactive']) & (~boosted_mask)
        inactive_mask = df_c['inactive'] & (~boosted_mask)
        traded_mask   = (df_c['traded_today'] == 1) & (~boosted_mask)
        
        # Scatter statuses
        ax.scatter(df_c['day'][active_mask],   [1]*active_mask.sum(),   c='blue',  marker='s', alpha=0.3)
        ax.scatter(df_c['day'][inactive_mask], [1]*inactive_mask.sum(), c='gray',  marker='x', alpha=0.3)
        ax.scatter(df_c['day'][traded_mask],   [1]*traded_mask.sum(),   c='red',   marker='o', alpha=0.5)
        ax.scatter(df_c['day'][boosted_mask],  [1]*boosted_mask.sum(),  c='green', marker='^', alpha=0.7, zorder=3)
        
        ax.set_yticks([])
        ax.set_ylim(0.5, 1.5)
        
        # p_good
        ax2 = ax.twinx()
        ax2.plot(df_c['day'], df_c['p_good'], color='black', lw=2)
        ax2.set_ylim(0, 1)
        ax2.set_ylabel("P(good|D)")
        
        ax.set_title(f"Client {cid}")
        
        # Track x-lims
        dmin, dmax = df_c['day'].min(), df_c['day'].max()
        x_min = dmin if x_min is None else min(x_min, dmin)
        x_max = dmax if x_max is None else max(x_max, dmax)
    
    axes[-1].set_xlabel("Day")
    if x_min is not None and x_max is not None:
        axes[-1].set_xlim(x_min, x_max)
    
    fig.suptitle("Selected clients activity", y=1.02)
    
    # Move legend below figure
    fig.legend(handles=legend_elems, loc='lower center', ncol=5, frameon=False, bbox_to_anchor=(0.5, -0.03))
    plt.show()


plot_clients_activity_prob([30,31,35, 37], panel_df)
../_images/5169899f0c2d2fc9171fd10e9d983b66812094e4bc224d521c8da326e4e49ab2.png
from sklearn.metrics import confusion_matrix

def _cm_df(cm, labels=("Pred 0", "Pred 1"), index=("True 0", "True 1")):
    """Pretty dataframe for a 2x2 confusion matrix."""
    return pd.DataFrame(cm, index=index, columns=labels)

def evaluate_confusions(panel_df, normalize=False):
    """
    Evaluate confusion matrices without recomputing thresholds.

    Uses:
      - df['abnormal'] as the model's predicted abnormal label (already thresholded).
      - df['inactive'] and df['boosted'] as operational labels.
      - Global abnormality = (inactive OR boosted).

    Returns dict with both raw arrays and pretty DataFrames.
    Set normalize=True to return per-true-class rates instead of counts.
    """
    df = panel_df.copy()

    # Ensure ints
    y_abnormal = df["abnormal"].astype(int).values
    y_inactive = df["inactive"].astype(int).values
    y_boosted  = df["boosted"].astype(int).values

    # Global abnormality (prediction-side for the overall CM)
    y_global_abn = (df["inactive"] | df["boosted"]).astype(int).values

    # Normalization mode for sklearn
    norm_mode = "true" if normalize else None

    results = {}

    # 1) Inactive (truth) vs Abnormal (model)
    cm_inactive = confusion_matrix(y_inactive, y_abnormal, labels=[0,1], normalize=norm_mode)
    results["inactive_vs_abnormal"] = cm_inactive
    results["inactive_vs_abnormal_df"] = _cm_df(cm_inactive)

    # 2) Boosted (truth) vs Abnormal (model)
    cm_boosted = confusion_matrix(y_boosted, y_abnormal, labels=[0,1], normalize=norm_mode)
    results["boosted_vs_abnormal"] = cm_boosted
    results["boosted_vs_abnormal_df"] = _cm_df(cm_boosted)

    # 3) OVERALL: Abnormal (truth) vs Global abnormality (prediction = inactive OR boosted)
    cm_overall = confusion_matrix(y_abnormal, y_global_abn, labels=[0,1], normalize=norm_mode)
    results["overall_abnormal_vs_global"] = cm_overall
    results["overall_abnormal_vs_global_df"] = _cm_df(cm_overall)

    return results

# Example usage:
cm_results = evaluate_confusions(panel_df, normalize=False)
print("Inactive (truth) vs Abnormal (model):\n", cm_results["inactive_vs_abnormal_df"], "\n")
print("Boosted (truth) vs Abnormal (model):\n", cm_results["boosted_vs_abnormal_df"], "\n")
print("Overall: Abnormal (truth) vs Global (inactive OR boosted) prediction:\n", cm_results["overall_abnormal_vs_global_df"])
Inactive (truth) vs Abnormal (model):
         Pred 0  Pred 1
True 0    3830      74
True 1      56     540 

Boosted (truth) vs Abnormal (model):
         Pred 0  Pred 1
True 0    3867     609
True 1      19       5 

Overall: Abnormal (truth) vs Global (inactive OR boosted) prediction:
         Pred 0  Pred 1
True 0    3811      75
True 1      69     545