Applying
In the
Sometimes it’s more convenient to work with summaries of subgroups in the data than with the raw data itself
https://en.wikipedia.org/wiki/Sufficient_statistic https://web.ma.utexas.edu/users/gordanz/notes/likelihood_color.pdf
https://www.statsmodels.org/devel/generated/statsmodels.genmod.generalized_linear_model.GLM.html
import pandas as pd
from statsmodels.api import formula as smf
from statsmodels import api as sm
df_long = pd.DataFrame({'x': [0] * 100 + [1]*100, 'y':[0] * 95 + [1]*5 + [0] * 5 + [1]*95})
long_model = smf.logit('y ~ x', df_long)
long_fit = long_model.fit()
print(long_fit.summary())
df_short = pd.DataFrame({'x': [0, 0, 1, 1], 'y':[1, 0, 1, 0]})
n_trials = pd.Series([5, 95, 95, 5])
short_model = smf.glm('y ~ x', df_short, n_trials=n_trials, family=sm.families.Binomial())
short_fit = short_model.fit()
print(short_fit.summary())
WLS implementation in statsmodels doesn’t work here? Try GLM https://www.statsmodels.org/devel/examples/notebooks/generated/glm_weights.html
from scipy.optimize import curve_fit
import numpy as np
from scipy.stats import norm
import pandas as pd
from statsmodels.api import formula as smf
from statsmodels import api as sm
n_per_group = 1000
df = pd.DataFrame({'x': [0] * n_per_group + [1] * n_per_group, 'y': np.concatenate((norm(1, 4).rvs(n_per_group), norm(3, 4).rvs(n_per_group)))})
def summarize(df, x_cols, y_col):
X_summary = []
y_summary = []
se_summary = []
for X_values, group_df in df.groupby(x_cols):
X_summary.append(X_values)
y_summary.append(np.mean(group_df[y_col]))
se_summary.append(np.std(group_df[y_col]) / np.sqrt(len(group_df)))
X_summary = pd.DataFrame(X_summary, columns=x_cols)
y_summary = np.array(y_summary)
se_summary = np.array(se_summary)
return X_summary, y_summary, se_summary
X_summary, y_summary, se_summary = summarize(df, ['x'], 'y')
def ols_summaries(X, y, se):
def f(X, *v):
a, b = v[0], v[1:]
return a + np.dot(X, b)
r, c = X.shape
params, cov = curve_fit(f, X, y, sigma=se, absolute_sigma=True, p0=np.ones(c+1))
params_se = np.sqrt(np.diag(cov))
return params, params_se
p_summary, p_se_summary = ols_summaries(X_summary, y_summary, se_summary)
print(p_summary, p_se_summary)
print(smf.ols('y ~ x', df).fit().summary())
print(sm.WLS(y_summary, sm.add_constant(X_summary), 1./se_summary**2).fit().summary())
https://en.wikipedia.org/wiki/Sufficient_statistic#Normal_distribution
\[\mathcal{L}(\mu, \sigma | \bar{y}, s^2, n) = (2 \pi \sigma^2)^{\frac{n}{2}} exp \left(\frac{n - 1}{2 \sigma^2} s^2 \right ) exp \left (-\frac{n}{2 \sigma^2} (\mu - \bar{y})^2 \right)\] \[ln \mathcal{L}(\mu, \sigma | \bar{y}, s^2, n) = -\frac{n}{2} ln(2 \pi \sigma^2) - \left( \frac{n-1}{2 \sigma^2}s^2 - \frac{n}{2\sigma^2} (\mu - \bar{y})^2 \right)\]import numpy as np
from scipy.optimize import minimize
def logpdf_sufficient(mu, sigma_sq, sample_mean, sample_var, n):
return -(n/2) * np.log(2*np.pi*sigma_sq) - (((n-1) / (2*sigma_sq)) * sample_var) - ((n / (2*sigma_sq)) * (mu - sample_mean)**2)
data = np.random.normal(0, 1, 1000)
n = len(data)
m, v = np.mean(data), np.var(data)
def neg_log_likelihood(p):
return -logpdf_sufficient(p[0], np.exp(p[1]), m, v, n)
result = minimize(neg_log_likelihood, np.array([5, -1]))
cov = result.hess_inv
se_mean = np.sqrt(cov[0][0])