Modelling RfQs in Dealer-to-Client Markets
Generative models for request-for-quote activity
Simulation of RfQ arrivals and client attrition
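Before the code, a sketch of the data-generating process implemented in `run_simulation` below (the symbols are ours, chosen for exposition): each client $i$ has a latent reservation price $r_i$ and an RfQ intensity $\lambda_i$, sends a Poisson number of RfQs per day, and trades whenever the dealer's quote is at or below a noisy draw of its reservation price; a client stops sending RfQs once its trailing hit rate becomes too low.

$$
r_i \sim \mathcal{N}\!\big(\mu_r,\ (\sigma_{\%}\,\mu_r)^2\big), \qquad
\lambda_i \sim \big|\mathcal{N}(\mu_\lambda,\ \sigma_\lambda^2)\big|, \qquad
N_{i,t} \sim \mathrm{Poisson}(\lambda_i),
$$
$$
\tilde r \sim \mathcal{N}\!\big(r_i,\ \sigma_{\text{noise}}^2\big), \qquad
\text{hit} = \mathbf{1}\{\,p \le \tilde r\,\}, \qquad
\text{client } i \text{ churns if } \overline{\text{hit rate}}_{\text{last } w \text{ days}} < \theta,
$$

with $w$ = `window_size` and $\theta$ = `attrition_threshold`.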
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import poisson, norm
from collections import deque
def run_simulation(N_clients=50,
T=200,
reservation_price_mean=100,
reservation_price_std_pct=0.1,
lambda_mean=1,
lambda_std=0.05,
prior_mean=60,
prior_std=10,
res_price_noise_std_pct=0.2,
hit_rate_target=0.4,
window_size=10,
attrition_threshold=0.1):
"""
Simulate RFQ interactions with Bayesian quantile pricing and client attrition.
Clients stop trading if their moving-average hit rate over `window_size` days falls below `attrition_threshold`.
    Returns:
        rfq_df: DataFrame of RfQ events
        active_history: list of active client counts per day
        activity_matrix: binary DataFrame of daily activity (days x clients)
        active_df: DataFrame of per-client active status at the end of each day (days x clients)
"""
np.random.seed(42)
# Generate distribution of reservation prices for the segment of clients
reservation_prices = np.random.normal(reservation_price_mean,
reservation_price_mean * reservation_price_std_pct,
N_clients)
    # Reservation prices are observed with noise, reflecting changing market conditions
res_price_noise_std = res_price_noise_std_pct * reservation_price_mean
res_price_noise_var = res_price_noise_std**2
# Generate distribution of RfQ intensities for the segment of clients
lambdas = np.abs(np.random.normal(lambda_mean, lambda_std, N_clients))
clients_posterior = {i: {'mean': prior_mean, 'var': prior_std**2,
'n': 0, 'sum_obs': 0.0}
for i in range(N_clients)}
z = norm.ppf(1 - hit_rate_target)
active = np.ones(N_clients, dtype=bool)
# Track per-client active status at end of each day
active_flags = np.zeros((T, N_clients), dtype=bool)
daily_history = [deque(maxlen=window_size) for _ in range(N_clients)]
active_history = []
records = []
Y = np.zeros((T, N_clients), dtype=int)
for t in range(T):
daily_hits = np.zeros(N_clients, dtype=int)
daily_reqs = np.zeros(N_clients, dtype=int)
for i in range(N_clients):
if not active[i]: continue
n_rfq = poisson.rvs(lambdas[i])
if n_rfq > 0: Y[t,i] = 1
for _ in range(n_rfq):
post = clients_posterior[i]
price = max(0.0, post['mean'] + np.sqrt(post['var'] + res_price_noise_var) * z)
r = norm.rvs(reservation_prices[i], res_price_noise_std)
# Trading happens when price offered is lower than the reservation price
hit = price <= r
daily_reqs[i] += 1; daily_hits[i] += int(hit)
                # The dealer updates its estimate of the client's reservation price
post['n'] += 1; post['sum_obs'] += r
post_var = 1/(1/prior_std**2 + post['n']/res_price_noise_var)
post['var'] = post_var
post['mean'] = post_var*(prior_mean/prior_std**2 + post['sum_obs']/res_price_noise_var)
records.append({'time': t, 'client_id': i, 'price': price, 'hit': hit})
for i in range(N_clients):
if not active[i] or daily_reqs[i]==0: continue
rate = daily_hits[i]/daily_reqs[i]
daily_history[i].append(rate)
            # A client stops sending RfQs to the dealer if its hit rate is too low
if len(daily_history[i])==window_size and np.mean(daily_history[i])<attrition_threshold:
active[i] = False
active_history.append(active.sum())
active_flags[t, :] = active.copy()
rfq_df = pd.DataFrame(records)
activity = pd.DataFrame(Y, columns=[f'client_{i}' for i in range(N_clients)])
active_df = pd.DataFrame(active_flags, columns=[f'client_{i}' for i in range(N_clients)])
return rfq_df, active_history, activity, active_df
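For reference, the quoting rule and the Bayesian update in `run_simulation` are the standard normal-normal conjugate setup with known observation noise. With target hit rate $h^{*}$ (`hit_rate_target`), prior $\mathcal{N}(\mu_0, \sigma_0^2)$ and $n$ observed reservation prices summing to $S$, the dealer quotes the $(1-h^{*})$ quantile of its predictive distribution for the client's next reservation-price draw:

$$
z = \Phi^{-1}(1 - h^{*}), \qquad
p = \max\!\Big(0,\ \mu_{\text{post}} + z\sqrt{\sigma^2_{\text{post}} + \sigma^2_{\text{noise}}}\Big),
$$
$$
\sigma^2_{\text{post}} = \Big(\tfrac{1}{\sigma_0^2} + \tfrac{n}{\sigma^2_{\text{noise}}}\Big)^{-1}, \qquad
\mu_{\text{post}} = \sigma^2_{\text{post}}\Big(\tfrac{\mu_0}{\sigma_0^2} + \tfrac{S}{\sigma^2_{\text{noise}}}\Big),
$$

so that, when the posterior is well calibrated, each quote is accepted with probability of roughly $h^{*}$. Note that the simulation lets the dealer observe the client's realized reservation price after every RfQ, hit or miss, which keeps the update conjugate.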
from scipy.special import betaln
from scipy.optimize import minimize
# ----------------------
# Segment-level Model
# ----------------------
def estimate_segment_params(activity_matrix):
def log_marginal_likelihood(D, alpha, beta, gamma, delta):
n, x = len(D), D.sum()
last = np.where(D==1)[0]
r = n - (last[-1]+1) if last.size>0 else n
logA = (betaln(alpha+x, beta+n-x)-betaln(alpha,beta)
+betaln(gamma, delta+n)-betaln(gamma,delta))
logB = [(betaln(alpha+x, beta+n-x-i)-betaln(alpha,beta)
+betaln(gamma+1, delta+n-i)-betaln(gamma,delta))
for i in range(1, r+1)]
mags = [logA] + logB; m_max = max(mags)
return m_max + np.log(sum(np.exp(m - m_max) for m in mags))
def neg_ll(params):
alpha, beta, gamma, delta = np.exp(params)
        # Sum the log marginal likelihood over all clients, using the full training history
        return -sum(log_marginal_likelihood(activity_matrix.iloc[:, j].values,
                                            alpha, beta, gamma, delta)
                    for j in range(activity_matrix.shape[1]))
res = minimize(neg_ll, np.log([1,1,1,1]), method='L-BFGS-B', bounds=[(-5,5)]*4)
return np.exp(res.x)
def attrition_probability(D, alpha, beta, gamma, delta):
n, x = len(D), D.sum()
last = np.where(D==1)[0]
r = n - (last[-1]+1) if last.size>0 else n
logA = (betaln(alpha+x, beta+n-x)-betaln(alpha,beta)
+betaln(gamma, delta+n)-betaln(gamma,delta))
logB = [(betaln(alpha+x, beta+n-x-i)-betaln(alpha,beta)
+betaln(gamma+1, delta+n-i)-betaln(gamma,delta))
for i in range(1, r+1)]
mags = [logA] + logB; m_max = max(mags)
logL = m_max + np.log(sum(np.exp(m - m_max) for m in mags))
P_active = np.exp(logA - logL)
return 1 - P_active
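The marginal likelihood summed in `estimate_segment_params` and reused in `attrition_probability` is that of a beta-Bernoulli activity process combined with a beta-geometric dropout process (a BG/BB-style "buy till you die" formulation). For a client observed over $n$ days with $x$ active days, last activity on day $t_x$, and $r = n - t_x$ trailing inactive days, the code computes

$$
\mathcal{L}(\alpha,\beta,\gamma,\delta \mid D)
= \underbrace{\frac{B(\alpha+x,\ \beta+n-x)}{B(\alpha,\beta)}\,
\frac{B(\gamma,\ \delta+n)}{B(\gamma,\delta)}}_{A\ (\text{still active at } n)}
\;+\; \sum_{i=1}^{r}
\frac{B(\alpha+x,\ \beta+n-x-i)}{B(\alpha,\beta)}\,
\frac{B(\gamma+1,\ \delta+n-i)}{B(\gamma,\delta)},
\qquad
P(\text{inactive after day } n \mid D) = 1 - \frac{A}{\mathcal{L}},
$$

evaluated with a log-sum-exp for numerical stability. Here $(\alpha,\beta)$ govern the daily activity probability while a client is still alive, and $(\gamma,\delta)$ govern the per-day dropout probability.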
from sklearn.metrics import (
confusion_matrix, accuracy_score,
precision_score, recall_score,
roc_curve, auc
)
# ----------------------
# Workflow: Simulate, Train, Test
# ----------------------
T_train, T_test = 100, 100
T_tot = T_train + T_test
N_clients = 50
rfq_all, active_all, Y_all, active_df = run_simulation(N_clients = N_clients, T=T_train+T_test)
Y_train = Y_all.iloc[:T_train].reset_index(drop=True)
Y_test = Y_all.iloc[T_train:].reset_index(drop=True)
rfq_test = rfq_all[rfq_all['time']>=T_train].copy()
rfq_test['time'] -= T_train
active_test = active_all[T_train:]
alpha, beta, gamma, delta = estimate_segment_params(Y_train)
print(f"Params: α={alpha:.2f}, β={beta:.2f}, γ={gamma:.2f}, δ={delta:.2f}")
# ----------------------
# Risk Scoring
# ----------------------
risk_records = []
for j in range(Y_test.shape[1]):
hist = []
for t in range(T_test):
hist.append(Y_test.iloc[t, j])
D = np.array(hist, dtype=int)
p_inact = attrition_probability(D, alpha, beta, gamma, delta)
risk_records.append({
'time': t,
'client_id': j,
'p_inactive': p_inact,
'alert': p_inact > 0.5 # thresholded prediction
})
risk_df = pd.DataFrame(risk_records)
# Label inactivity
risk_df['inactive'] = risk_df.apply(
lambda r: not active_df.loc[r['time'] + T_train, f'client_{r["client_id"]}'],
axis=1
)
# ----------------------
# Evaluation Metrics
# ----------------------
# Ground truth and predictions
y_true = risk_df['inactive']
y_pred = risk_df['alert'] # at 0.5 threshold
y_scores = risk_df['p_inactive'] # raw probabilities
# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("Confusion Matrix:")
print(f" TN={tn}, FP={fp}, FN={fn}, TP={tp}")
# Accuracy, Precision, Recall at 0.5
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall : {recall:.4f}")
# ROC curve & AUC
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)
print(f"AUC : {roc_auc:.4f}")
# Plot ROC
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()
Params: α=148.41, β=85.31, γ=0.08, δ=148.41
Confusion Matrix:
TN=4365, FP=0, FN=76, TP=559
Accuracy : 0.9848
Precision: 1.0000
Recall : 0.8803
AUC : 0.9884
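A rough reading of the fitted segment parameters, using the Beta means: the implied daily activity probability while a client is alive is $\alpha/(\alpha+\beta) = 148.41/233.72 \approx 0.64$, and the implied per-day dropout probability is $\gamma/(\gamma+\delta) = 0.08/148.49 \approx 5\times10^{-4}$, i.e. active clients trade on roughly two days out of three and the dropout component places low probability on churn in any single day.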

# ----------------------
# Plots: Hit Rate & Attrition Over Full Simulation
# ----------------------
# Active-client hit rate over time
hit_rate_active = []
days = np.arange(T_tot)
for t in days:
act_clients = active_df.iloc[t]
active_ids = [int(col.split('_')[1]) for col, flag in act_clients.items() if flag]
daily = rfq_all[rfq_all['time'] == t]
rates = []
for cid in active_ids:
cr = daily[daily['client_id'] == cid]
if not cr.empty:
rates.append(cr['hit'].mean())
hit_rate_active.append(np.mean(rates) if rates else np.nan)
plt.figure(figsize=(10,5))
plt.plot(days, hit_rate_active, label='Avg Hit Rate', color='blue')
plt.title('Average Hit Rate Over Time')
plt.xlabel('Day'); plt.ylabel('Hit Rate'); plt.grid(True); plt.legend()
# Attrition rate over time
attr_rate = [100 * (Y_all.shape[1] - x) / Y_all.shape[1] for x in active_all]
plt.figure(figsize=(10,5))
plt.plot(days, attr_rate, label='% Inactive', color='red')
plt.title('Cumulative Attrition Rate Over Time')
plt.xlabel('Day'); plt.ylabel('% Inactive'); plt.grid(True); plt.legend()


# ----------------------
# Model Fit Visualization
# ----------------------
# Compare predicted attrition vs actual on test
pred_rate = risk_df.groupby('time')['p_inactive'].mean()
actual_rate = [100*(N_clients-x)/N_clients for x in active_test]
plt.figure(figsize=(10,5))
plt.plot(pred_rate.index, 100*pred_rate.values, label='Predicted % Inactive')
plt.plot(actual_rate, label='Actual % Inactive')
plt.title('Predicted vs Actual Attrition (Test Set)')
plt.xlabel('Day'); plt.ylabel('% Inactive'); plt.legend(); plt.grid(True)

import numpy as np
import matplotlib.pyplot as plt
# ----------------------
# Function: Plot Clients by ID (single figure with subplots)
# ----------------------
def plot_clients_subplots(client_ids):
"""
For each client in client_ids, create a subplot in one figure:
- Left y-axis: active (blue=active, red=inactive) and trading days (green), with numeric ticks (0/1)
- Right y-axis: attrition probability over time
- Separate legends: left for 'Active', 'Inactive', 'Traded'; right for 'P(Inactive)'
- Title of each subplot shows the client ID
"""
n = len(client_ids)
fig, axes = plt.subplots(nrows=n, ncols=1, figsize=(12, 4*n), sharex=True)
if n == 1:
axes = [axes]
days = np.arange(T_test)
for ax1, cid in zip(axes, client_ids):
# gather P(inactive) values
p_vals = [
risk_df.loc[
(risk_df['client_id'] == cid) & (risk_df['time'] == t),
'p_inactive'
].item()
for t in days
]
# latent active flags and trading days
active_flag = active_df[T_train:][f'client_{cid}'].values
trade_days = Y_test.iloc[:, cid]
# Left axis: Active vs Inactive
idx_active = days[active_flag]
idx_inactive = days[~active_flag]
h_active = ax1.scatter(idx_active, [1]*len(idx_active), c='blue', marker='s', alpha=0.3, label='Active')
h_inactive = ax1.scatter(idx_inactive, [1]*len(idx_inactive), c='red', marker='s', alpha=0.3, label='Inactive')
# Left axis: actual trades (0 or 1)
h_trade = ax1.scatter(trade_days.index, trade_days.values,
c='green', marker='o', label='Traded')
# Set numeric ticks on left y-axis
ax1.set_ylim(-0.2, 1.2)
ax1.set_yticks([0, 1])
ax1.set_ylabel('Activity (0/1)')
ax1.set_title(f'Client {cid} Activity and Attrition Prob')
# Left-axis legend
ax1.legend(handles=[h_active, h_inactive, h_trade],
labels=['Active', 'Inactive', 'Traded'],
loc='upper left')
# Right axis: P(Inactive)
ax2 = ax1.twinx()
h_prob, = ax2.plot(days, p_vals, c='black', label='P(Inactive)')
ax2.set_ylim(0, 1)
ax2.set_ylabel('P(Inactive)')
# Right-axis legend
ax2.legend(handles=[h_prob], loc='upper right')
axes[-1].set_xlabel('Day')
plt.tight_layout()
plt.show()
# Example: plot select clients in one figure
#plot_clients_subplots([3, 8, 9, 18])
plot_clients_subplots([3, 8, 9, 30])

Simulation of abnormal client behavior
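In this variant the simulator adds an "abnormal" regime: a client that receives a quote at least `discount_trigger_pct` below its (noisy) reservation price switches to a boosted mode in which its RfQ intensity is multiplied by `boosted_rate_factor`, and it only reverts to its base intensity once, while boosted, it receives a quote above the reservation price by the reciprocal factor. Schematically, with $d$ = `discount_trigger_pct` and $f$ = `boosted_rate_factor`:

$$
p \le (1-d)\,\tilde r \;\Rightarrow\; \lambda_i \leftarrow f\,\lambda_i^{\text{base}},
\qquad
p > \frac{\tilde r}{1-d} \;\Rightarrow\; \lambda_i \leftarrow \lambda_i^{\text{base}}.
$$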
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import poisson, norm
from collections import deque
def run_simulation(N_clients=50,
T=200,
reservation_price_mean=100,
reservation_price_std_pct=0.1,
lambda_mean=1,
lambda_std=0.05,
prior_mean=60,
prior_std=10,
reservation_price_noise_std=None,
discount_trigger_pct=0.35, # trigger threshold: 0.5 means 50% lower
boosted_rate_factor = 10,
hit_rate_target=0.4,
window_size=10,
attrition_threshold=0.1,
random_seed=42):
"""
    Simulate RfQ interactions with Bayesian quantile pricing and client attrition.
    Adds a mechanism: if a client receives a quote at least `discount_trigger_pct` below its noisy
    reservation price, it multiplies its RfQ rate by `boosted_rate_factor`, and it reverts to the
    base rate only once the dealer quotes above the reservation price by the factor 1/(1 - discount_trigger_pct).
"""
np.random.seed(random_seed)
# Generate distribution of reservation prices
reservation_prices = np.random.normal(reservation_price_mean,
reservation_price_mean * reservation_price_std_pct,
N_clients)
# Noise in reservation prices
fair_price = reservation_price_mean
res_price_noise_std = reservation_price_noise_std or (fair_price * reservation_price_std_pct * 2)
res_price_noise_var = res_price_noise_std ** 2
# Base RfQ intensities
lambdas = np.abs(np.random.normal(lambda_mean, lambda_std, N_clients))
base_lambdas = lambdas.copy() # store for reset later
# Posterior beliefs
clients_posterior = {i: {'mean': prior_mean, 'var': prior_std ** 2,
'n': 0, 'sum_obs': 0.0}
for i in range(N_clients)}
z = norm.ppf(1 - hit_rate_target)
active = np.ones(N_clients, dtype=bool)
# Flags for tracking "boosted" mode
boosted = np.zeros(N_clients, dtype=bool)
# Track daily active/boosted status
active_flags = np.zeros((T, N_clients), dtype=bool)
boosted_flags = np.zeros((T, N_clients), dtype=bool)
daily_history = [deque(maxlen=window_size) for _ in range(N_clients)]
active_history = []
records = []
Y = np.zeros((T, N_clients), dtype=int) # binary: whether client sent at least one RFQ that day
for t in range(T):
daily_hits = np.zeros(N_clients, dtype=int)
daily_reqs = np.zeros(N_clients, dtype=int)
for i in range(N_clients):
if not active[i]:
continue
n_rfq = poisson.rvs(lambdas[i])
if n_rfq > 0:
Y[t, i] = 1
for _ in range(n_rfq):
post = clients_posterior[i]
price = max(0.0, post['mean'] + np.sqrt(post['var'] + res_price_noise_var) * z)
r = norm.rvs(reservation_prices[i], res_price_noise_std)
# Trading happens when price <= reservation price
hit = price <= r
daily_reqs[i] += 1
daily_hits[i] += int(hit)
# Check discount trigger
if price <= (1 - discount_trigger_pct) * r:
boosted[i] = True
lambdas[i] = base_lambdas[i] * boosted_rate_factor
elif boosted[i] and price > r / (1-discount_trigger_pct):
boosted[i] = False
lambdas[i] = base_lambdas[i]
# Update posterior belief
post['n'] += 1
post['sum_obs'] += r
post_var = 1 / (1 / (prior_std ** 2) + post['n'] / res_price_noise_var)
post['var'] = post_var
post['mean'] = post_var * (prior_mean / (prior_std ** 2) +
post['sum_obs'] / res_price_noise_var)
records.append({'time': t,
'client_id': i,
'price': price,
'reservation': r,
'hit': hit,
'boosted': boosted[i]})
# Update attrition status
for i in range(N_clients):
if not active[i] or daily_reqs[i] == 0:
continue
rate = daily_hits[i] / daily_reqs[i]
daily_history[i].append(rate)
if len(daily_history[i]) == window_size and np.mean(daily_history[i]) < attrition_threshold:
active[i] = False
active_history.append(int(active.sum()))
active_flags[t, :] = active.copy()
boosted_flags[t, :] = boosted.copy()
rfq_df = pd.DataFrame(records)
activity_df = pd.DataFrame(Y, columns=[f'client_{i}' for i in range(N_clients)])
active_df = pd.DataFrame(active_flags, columns=[f'client_{i}' for i in range(N_clients)])
boosted_df = pd.DataFrame(boosted_flags, columns=[f'client_{i}' for i in range(N_clients)])
return rfq_df, active_history, activity_df, active_df, boosted_df
# Run simulation
rfq_df, active_history, activity_df, active_df, boosted_df = run_simulation()
# Plot active and boosted clients over time
plt.figure(figsize=(10,5))
plt.plot(active_df.sum(axis=1), label='Active Clients')
plt.plot(boosted_df.sum(axis=1), label='Boosted Clients', linestyle='--')
plt.xlabel('Day')
plt.ylabel('Number of Clients')
plt.title('Active vs Boosted Clients Over Time')
plt.legend()
plt.grid(True)
plt.show()
# Heatmap of boosted status
plt.figure(figsize=(12,6))
plt.imshow(boosted_df.T, aspect='auto', cmap='Reds')
plt.colorbar(label='Boosted Status (1=Yes, 0=No)')
plt.xlabel('Day')
plt.ylabel('Client ID')
plt.title('Boosted Clients Over Time')
plt.show()


from scipy.special import betaln, gammaln
from scipy.optimize import minimize
# -----------------------------
# Beta-Binomial mixture MLE with shared mean
# -----------------------------
def log_beta_binom_pmf(x, n, alpha, beta):
# log [ C(n,x) * B(alpha+x, beta+n-x) / B(alpha,beta) ]
return (
gammaln(n + 1) - gammaln(x + 1) - gammaln(n - x + 1)
+ betaln(alpha + x, beta + n - x)
- betaln(alpha, beta)
)
def neg_log_likelihood(params, xs, ns):
# params = [logit_mu, log_k_g, log_k_b, logit_qg]
logit_mu, log_k_g, log_k_b, logit_qg = params
mu = 1 / (1 + np.exp(-logit_mu)) # shared mean
k_g = np.exp(log_k_g) + 1e-8
k_b = np.exp(log_k_b) + 1e-8
qg = 1 / (1 + np.exp(-logit_qg))
# convert to alpha, beta
alpha_g, beta_g = mu * k_g, (1 - mu) * k_g
alpha_b, beta_b = mu * k_b, (1 - mu) * k_b
# mixture likelihood per client
ll = []
for x, n in zip(xs, ns):
log_p_g = log_beta_binom_pmf(x, n, alpha_g, beta_g)
log_p_b = log_beta_binom_pmf(x, n, alpha_b, beta_b)
# log-sum-exp for mixture
a = np.log(qg) + log_p_g
b = np.log(1 - qg) + log_p_b
m = max(a, b)
ll_i = m + np.log(np.exp(a - m) + np.exp(b - m))
ll.append(ll_i)
return -np.sum(ll)
def fit_beta_mixture_mle(activity_df, train_days):
"""
activity_df: (T x N) binary DataFrame
train_days: number of days used for training (first half)
Returns dict with fitted parameters.
"""
T, N = activity_df.shape
A = activity_df.iloc[:train_days].values # (train_days x N)
xs = A.sum(axis=0) # successes per client in training
ns = np.full_like(xs, fill_value=train_days)
# rough start for mean
p_hat = xs.mean() / train_days
logit_mu0 = np.log(p_hat / (1 - p_hat + 1e-8))
start = np.array([
logit_mu0,
np.log(10.0), # k_g initial concentration
np.log(2.0), # k_b initial concentration
0.0 # logit(0.5)
], dtype=float)
res = minimize(neg_log_likelihood, start, args=(xs, ns), method='L-BFGS-B')
logit_mu, log_k_g, log_k_b, logit_qg = res.x
mu = 1 / (1 + np.exp(-logit_mu))
k_g = np.exp(log_k_g) + 1e-8
k_b = np.exp(log_k_b) + 1e-8
params = {
'mu': float(mu),
'alpha_g': float(mu * k_g), 'beta_g': float((1 - mu) * k_g),
'alpha_b': float(mu * k_b), 'beta_b': float((1 - mu) * k_b),
'q_g': float(1 / (1 + np.exp(-logit_qg))),
'success': bool(res.success),
'message': res.message
}
return params
def posterior_good(x, n, params):
a_g = params['alpha_g']; b_g = params['beta_g']
a_b = params['alpha_b']; b_b = params['beta_b']
q_g = params['q_g']
log_p_g = log_beta_binom_pmf(x, n, a_g, b_g)
log_p_b = log_beta_binom_pmf(x, n, a_b, b_b)
a = np.log(q_g) + log_p_g
b = np.log(1 - q_g) + log_p_b
m = max(a, b)
num = np.exp(a - m)
den = num + np.exp(b - m)
return float(num / den)
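The functions above implement a two-component beta-binomial mixture with a shared mean: both components have mean $\mu$ but different concentrations $k_g$ (the "good", tightly concentrated component) and $k_b$ (the "bad", diffuse component), so $\alpha_c = \mu k_c$ and $\beta_c = (1-\mu)k_c$. For a client with $x$ active days out of $n$, the beta-binomial pmf and the per-client mixture likelihood maximised in `neg_log_likelihood` are

$$
P(x \mid n, \alpha, \beta) = \binom{n}{x}\,\frac{B(\alpha+x,\ \beta+n-x)}{B(\alpha,\beta)},
\qquad
\mathcal{L}_i = q_g\,P(x_i \mid n_i, \alpha_g, \beta_g) + (1-q_g)\,P(x_i \mid n_i, \alpha_b, \beta_b),
$$

and `posterior_good` returns the responsibility of the good component by Bayes' rule:

$$
P(\text{good} \mid x, n) =
\frac{q_g\,P(x \mid n, \alpha_g, \beta_g)}
{q_g\,P(x \mid n, \alpha_g, \beta_g) + (1-q_g)\,P(x \mid n, \alpha_b, \beta_b)}.
$$

The optimisation works on the unconstrained scale $(\mathrm{logit}\,\mu,\ \log k_g,\ \log k_b,\ \mathrm{logit}\,q_g)$, which keeps every parameter in its valid range without explicit bounds.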
# Fit on a `mid`-day training window, skipping the first 25 days of burn-in while the simulation settles
T = activity_df.shape[0]
mid = T // 2
params = fit_beta_mixture_mle(activity_df[25:], train_days=mid)
# Detect abnormalities on second half using a sliding window
n_window = 10 # window size for detection, can be adjusted
threshold = 0.5 # classify as abnormal if p(good|D) < threshold
N = activity_df.shape[1]
abnormal_flags = np.zeros_like(activity_df.values, dtype=bool)
# For each day t in the second half, use the last n_window days (bounded within the second half)
for t in range(mid, T):
start = max(mid, t - n_window + 1)
end = t + 1
window = activity_df.iloc[start:end].values # (w x N)
n = window.shape[0]
x = window.sum(axis=0)
# Posterior for each client
p_good = np.array([posterior_good(int(xi), int(n), params) for xi in x])
abnormal_flags[t, :] = p_good < threshold
abnormal_df = pd.DataFrame(abnormal_flags, columns=activity_df.columns, index=activity_df.index)
params
{'mu': 0.6459024041525655,
'alpha_g': 17819.14219878137,
'beta_g': 9768.837168102045,
'alpha_b': 0.6772910514791353,
'beta_b': 0.3713055277018204,
'q_g': 0.892871535754001,
'success': True,
'message': 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'}
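Read through the shared-mean parameterisation, the fit gives $\mu \approx 0.65$, a "good" component with a very large concentration ($\alpha_g + \beta_g \approx 2.8\times10^4$, i.e. activity probabilities tightly clustered around 0.65), a "bad" component that is almost flat ($\alpha_b + \beta_b \approx 1.0$, accommodating both near-zero and near-one activity rates), and a prior weight of roughly 0.89 on the good component.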
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta
def plot_fitted_betas(params, bins=200):
"""
Plot fitted good vs bad Beta distributions.
params: dict returned by fit_beta_mixture_mle
"""
a_g, b_g = params['alpha_g'], params['beta_g']
a_b, b_b = params['alpha_b'], params['beta_b']
q_g = params['q_g']
# Grid
x = np.linspace(0, 1, bins)
# Densities
pdf_g = beta.pdf(x, a_g, b_g)
pdf_b = beta.pdf(x, a_b, b_b)
plt.figure(figsize=(10, 6))
plt.plot(x, pdf_g, label=f"Good (α={a_g:.2f}, β={b_g:.2f})", lw=2, color="blue")
plt.plot(x, pdf_b, label=f"Bad (α={a_b:.2f}, β={b_b:.2f})", lw=2, color="red")
plt.fill_between(x, pdf_g, alpha=0.2, color="blue")
plt.fill_between(x, pdf_b, alpha=0.2, color="red")
plt.title("Fitted Beta Distributions (Good vs Bad)")
plt.xlabel("Success probability")
plt.ylabel("Density")
plt.legend()
plt.grid(True, ls="--", alpha=0.5)
plt.show()
plot_fitted_betas(params)

# Use a previous window (exclude day t) when computing P(good|D)
n_window = 10
threshold = 0.5
def build_pgood_panel(activity_df, boosted_df, active_df, params, mid, n_window, threshold):
T, N = activity_df.shape
rows = []
# Start at mid + n_window to ensure a full *previous* window exists
for t in range(mid + n_window, T):
start = t - n_window
end = t # exclude day t
window = activity_df.iloc[start:end] # (n_window x N)
x = window.sum(axis=0).values.astype(int) # successes per client in previous window
n = n_window
# compute p_good for each client
p_goods = []
for j in range(N):
p = posterior_good(int(x[j]), int(n), params)
p_goods.append(p)
p_goods = np.array(p_goods)
# flags at time t
boosted_t = boosted_df.iloc[t].values.astype(bool)
active_t = active_df.iloc[t].values.astype(bool)
inactive_t = ~active_t
traded_today = activity_df.iloc[t].values.astype(int)
for j in range(N):
rows.append({
'day': t,
'client_id': j,
'x': int(x[j]),
'n': int(n),
'p_good': float(p_goods[j]),
'abnormal': bool(p_goods[j] < threshold),
'boosted': bool(boosted_t[j]),
'inactive': bool(inactive_t[j]),
'traded_today': int(traded_today[j])
})
panel = pd.DataFrame(rows)
return panel
panel_df = build_pgood_panel(activity_df, boosted_df, active_df, params, mid, n_window, threshold)
# Daily summary for the second half, aligned with the windowed evaluation above
days = sorted(panel_df['day'].unique())
daily_summary = panel_df.groupby('day').agg(
abnormal_count=('abnormal', 'sum'),
boosted_count=('boosted', 'sum'),
inactive_count=('inactive', 'sum')
).reset_index()
# Overlap summary on client-day panel
overlap_summary = {
'days_evaluated': int(len(days)),
'clients': int(N),
'total_client_days': int(panel_df.shape[0]),
'abnormal_client_days': int(panel_df['abnormal'].sum()),
'boosted_client_days': int(panel_df['boosted'].sum()),
'inactive_client_days': int(panel_df['inactive'].sum()),
'abnormal_and_boosted': int((panel_df['abnormal'] & panel_df['boosted']).sum()),
'abnormal_and_inactive': int((panel_df['abnormal'] & panel_df['inactive']).sum()),
'abnormal_not_boosted_or_inactive': int((panel_df['abnormal'] & ~panel_df['boosted'] & ~panel_df['inactive']).sum())
}
overlap_df2 = pd.DataFrame([overlap_summary])
# Heatmap of p_good by client (days x clients) for visual inspection
# Build a matrix with rows=days, cols=clients
days_idx = daily_summary['day'].values
pgood_mat = np.full((len(days_idx), N), np.nan)
for i, t in enumerate(days_idx):
slice_t = panel_df[panel_df['day'] == t].sort_values('client_id')
pgood_mat[i, :] = slice_t['p_good'].values
plt.figure(figsize=(12,6))
plt.imshow(pgood_mat.T, aspect='auto')
plt.colorbar(label='P(good | previous window)')
plt.xlabel('Day index in second half')
plt.ylabel('Client ID')
plt.title('Per-client P(good|D) over time (previous window)')
plt.show()

from matplotlib.lines import Line2D
def plot_clients_activity_prob(clients, df):
"""
One figure with one row per client.
On boosted days, only the 'Boosted' marker is shown (others hidden).
"""
clients = list(clients)
n = len(clients)
if n == 0:
return
fig, axes = plt.subplots(n, 1, figsize=(12, 2.8 * n), sharex=True, constrained_layout=True)
if n == 1:
axes = [axes]
# Build a single, global legend
legend_elems = [
Line2D([0],[0], marker='s', linestyle='None', color='blue', alpha=0.3, label='Active'),
Line2D([0],[0], marker='x', linestyle='None', color='gray', alpha=0.3, label='Inactive'),
Line2D([0],[0], marker='o', linestyle='None', color='red', alpha=0.5, label='Traded'),
Line2D([0],[0], marker='^', linestyle='None', color='green', alpha=0.7, label='Boosted'),
Line2D([0],[0], linestyle='-', color='black', label='P(good|D)'),
]
x_min, x_max = None, None
for ax, cid in zip(axes, clients):
df_c = df[df['client_id'] == cid].sort_values('day').copy()
if df_c.empty:
ax.set_title(f"Client {cid} (no data)")
ax.set_yticks([])
continue
# Masks
boosted_mask = df_c['boosted']
active_mask = (~df_c['inactive']) & (~boosted_mask)
inactive_mask = df_c['inactive'] & (~boosted_mask)
traded_mask = (df_c['traded_today'] == 1) & (~boosted_mask)
# Scatter statuses
ax.scatter(df_c['day'][active_mask], [1]*active_mask.sum(), c='blue', marker='s', alpha=0.3)
ax.scatter(df_c['day'][inactive_mask], [1]*inactive_mask.sum(), c='gray', marker='x', alpha=0.3)
ax.scatter(df_c['day'][traded_mask], [1]*traded_mask.sum(), c='red', marker='o', alpha=0.5)
ax.scatter(df_c['day'][boosted_mask], [1]*boosted_mask.sum(), c='green', marker='^', alpha=0.7, zorder=3)
ax.set_yticks([])
ax.set_ylim(0.5, 1.5)
# p_good
ax2 = ax.twinx()
ax2.plot(df_c['day'], df_c['p_good'], color='black', lw=2)
ax2.set_ylim(0, 1)
ax2.set_ylabel("P(good|D)")
ax.set_title(f"Client {cid}")
# Track x-lims
dmin, dmax = df_c['day'].min(), df_c['day'].max()
x_min = dmin if x_min is None else min(x_min, dmin)
x_max = dmax if x_max is None else max(x_max, dmax)
axes[-1].set_xlabel("Day")
if x_min is not None and x_max is not None:
axes[-1].set_xlim(x_min, x_max)
fig.suptitle("Selected clients activity", y=1.02)
# Move legend below figure
fig.legend(handles=legend_elems, loc='lower center', ncol=5, frameon=False, bbox_to_anchor=(0.5, -0.03))
plt.show()
plot_clients_activity_prob([30,31,35, 37], panel_df)

from sklearn.metrics import confusion_matrix
def _cm_df(cm, labels=("Pred 0", "Pred 1"), index=("True 0", "True 1")):
"""Pretty dataframe for a 2x2 confusion matrix."""
return pd.DataFrame(cm, index=index, columns=labels)
def evaluate_confusions(panel_df, normalize=False):
"""
Evaluate confusion matrices without recomputing thresholds.
Uses:
- df['abnormal'] as the model's predicted abnormal label (already thresholded).
- df['inactive'] and df['boosted'] as operational labels.
- Global abnormality = (inactive OR boosted).
Returns dict with both raw arrays and pretty DataFrames.
Set normalize=True to return per-true-class rates instead of counts.
"""
df = panel_df.copy()
# Ensure ints
y_abnormal = df["abnormal"].astype(int).values
y_inactive = df["inactive"].astype(int).values
y_boosted = df["boosted"].astype(int).values
# Global abnormality (prediction-side for the overall CM)
y_global_abn = (df["inactive"] | df["boosted"]).astype(int).values
# Normalization mode for sklearn
norm_mode = "true" if normalize else None
results = {}
# 1) Inactive (truth) vs Abnormal (model)
cm_inactive = confusion_matrix(y_inactive, y_abnormal, labels=[0,1], normalize=norm_mode)
results["inactive_vs_abnormal"] = cm_inactive
results["inactive_vs_abnormal_df"] = _cm_df(cm_inactive)
# 2) Boosted (truth) vs Abnormal (model)
cm_boosted = confusion_matrix(y_boosted, y_abnormal, labels=[0,1], normalize=norm_mode)
results["boosted_vs_abnormal"] = cm_boosted
results["boosted_vs_abnormal_df"] = _cm_df(cm_boosted)
# 3) OVERALL: Abnormal (truth) vs Global abnormality (prediction = inactive OR boosted)
cm_overall = confusion_matrix(y_abnormal, y_global_abn, labels=[0,1], normalize=norm_mode)
results["overall_abnormal_vs_global"] = cm_overall
results["overall_abnormal_vs_global_df"] = _cm_df(cm_overall)
return results
# Example usage:
cm_results = evaluate_confusions(panel_df, normalize=False)
print("Inactive (truth) vs Abnormal (model):\n", cm_results["inactive_vs_abnormal_df"], "\n")
print("Boosted (truth) vs Abnormal (model):\n", cm_results["boosted_vs_abnormal_df"], "\n")
print("Overall: Abnormal (truth) vs Global (inactive OR boosted) prediction:\n", cm_results["overall_abnormal_vs_global_df"])
Inactive (truth) vs Abnormal (model):
Pred 0 Pred 1
True 0 3830 74
True 1 56 540
Boosted (truth) vs Abnormal (model):
Pred 0 Pred 1
True 0 3867 609
True 1 19 5
Overall: Abnormal (truth) vs Global (inactive OR boosted) prediction:
Pred 0 Pred 1
True 0 3811 75
True 1 69 545