Note0716 (mediation analysis with pymc)

Author

Tsai JW

Published

July 16, 2023

import pymc as pm
import numpy as np
import pandas as pd
import arviz as az
import matplotlib.pyplot as plt
#dat = pd.read_csv('data1463_fin3.csv')

To make it possible to run this model again later, a new dataset is simulated based on the original data.

(Originally this did not account for the correlations among the variables.) This has now been taken into account!! The data are simulated from a multivariate normal distribution.

# How the targets below were obtained from the original data:
# [dat.acd1_eap.mean(), dat.scl_eap.mean(), dat.c1.mean(), dat.c2.mean(), dat.c3.mean()]
# np.cov([dat.acd1_eap, dat.scl_eap, dat.c1, dat.c2, dat.c3])
dat_mn = np.array(
    [2.8510193121856177e-05,
     1.9250494837216173e-06,
     0.19822282980177716,
     0.11483253588516747,
     0.11551606288448393]
)
dat_cov = np.array([
    [ 0.7959285 ,  0.16304568, -0.01541732, -0.01689872, -0.02973076],
    [ 0.16304568,  0.51185381, -0.05390401, -0.00531492, -0.02327021],
    [-0.01541732, -0.05390401,  0.15903925, -0.022778  , -0.02291358],
    [-0.01689872, -0.00531492, -0.022778  ,  0.10171555, -0.01327408],
    [-0.02973076, -0.02327021, -0.02291358, -0.01327408,  0.10224199]
])
dat = np.random.multivariate_normal(dat_mn, dat_cov, 1000)
dat = pd.DataFrame(dat, columns=['acd1_eap', 'scl_eap', 'c1', 'c2', 'c3'])
# Dichotomize the simulated covariates so that c1, c2, c3 are binary again.
dat['c1'] = dat['c1'] > 0.5
dat['c2'] = dat['c2'] > 0.5
dat['c3'] = dat['c3'] > 0.5
dat
acd1_eap scl_eap c1 c2 c3
0 0.166286 0.234901 False False False
1 -1.781805 0.507308 False False False
2 1.508147 0.059536 False False False
3 -0.357505 -0.018872 True False False
4 -0.104632 -0.378684 False False True
... ... ... ... ... ...
995 -0.268559 0.530033 False True False
996 -0.802137 0.281370 True False False
997 0.210101 -0.957152 False False True
998 -0.551066 -0.537315 False False False
999 -1.006819 0.682656 False False False

1000 rows × 5 columns
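Not part of the original note: a quick sanity check (a sketch assuming the simulated dat above) that the draws roughly reproduce the target moments. The proportions of c1, c2, c3 will only approximately match the original means because the continuous draws were dichotomized at 0.5.

# Illustrative check of the simulated data against the targets above.
print(dat[['acd1_eap', 'scl_eap']].mean())  # should be close to dat_mn[:2]
print(dat[['acd1_eap', 'scl_eap']].cov())   # should be close to dat_cov[:2, :2]
print(dat[['c1', 'c2', 'c3']].mean())       # proportions after dichotomizing at 0.5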

# Earlier, superseded approach: simulate each variable independently
# (this ignores the correlations between the variables).
# dat_dict = {
#     'acd1_eap': np.random.normal(loc=2.8510193121856177e-05, scale=0.8921482479092293, size=1000),
#     'scl_eap': np.random.normal(loc=1.9250494837216173e-06, scale=0.7154395945457549, size=1000),
#     'c1': np.random.binomial(n=1, p=0.19822282980177716, size=1000),
#     'c2': np.random.binomial(n=1, p=0.11483253588516747, size=1000),
#     'c3': np.random.binomial(n=1, p=0.11551606288448393, size=1000),
# }
# dat = pd.DataFrame(dat_dict)
"\ndat_dict = {\n    'acd1_eap': np.random.normal(loc=2.8510193121856177e-05, scale=0.8921482479092293, size=1000),\n    'scl_eap': np.random.normal(loc=1.9250494837216173e-06, scale=0.7154395945457549, size=1000),\n    'c1': np.random.binomial(n=1, p=0.19822282980177716, size=1000),\n    'c2': np.random.binomial(n=1, p=0.11483253588516747, size=1000),\n    'c3': np.random.binomial(n=1, p=0.11551606288448393, size=1000),    \n}\ndat = pd.DataFrame(dat_dict)\n"
with pm.Model() as model:
    acd1eap = pm.ConstantData('acd1eap', dat.acd1_eap)
    scleap = pm.ConstantData('scleap', dat.scl_eap)
    c1 = pm.ConstantData('c1', dat.c1)
    c2 = pm.ConstantData('c2', dat.c2)
    c3 = pm.ConstantData('c3', dat.c3)

    # intercept
    acd1eap_Intercept = pm.Normal('acd1eap_Intercept', mu=0, sigma=100)
    scleap_Intercept = pm.Normal('scleap_Intercept', mu=0, sigma=100)
    
    # noise
    acd1eap_Sigma = pm.HalfCauchy("acd1eap_Sigma", 1)
    scleap_Sigma = pm.HalfCauchy("scleap_Sigma", 1)

    # slope
    acd1eap_scleap = pm.Normal('acd1eap_scleap', mu=0, sigma=100)
    acd1eap_c1 = pm.Normal('acd1eap_c1', mu=0, sigma=100)
    acd1eap_c2 = pm.Normal('acd1eap_c2', mu=0, sigma=100)
    acd1eap_c3 = pm.Normal('acd1eap_c3', mu=0, sigma=100)
    scleap_c1 = pm.Normal('scleap_c1', mu=0, sigma=100)
    scleap_c2 = pm.Normal('scleap_c2', mu=0, sigma=100)
    scleap_c3 = pm.Normal('scleap_c3', mu=0, sigma=100)

    # likelihood
    # outcome model: acd1_eap ~ scl_eap + c1 + c2 + c3
    pm.Normal('y_likelihood', mu=acd1eap_Intercept + acd1eap_scleap * scleap + acd1eap_c1 * c1 + acd1eap_c2 * c2 + acd1eap_c3 * c3, sigma=acd1eap_Sigma, observed=acd1eap)
    # mediator model: scl_eap ~ c1 + c2 + c3
    pm.Normal('m_likelihood', mu=scleap_Intercept + scleap_c1 * c1 + scleap_c2 * c2 + scleap_c3 * c3, sigma=scleap_Sigma, observed=scleap)
    
    trace_med = pm.sample(2000, chains=4, cores=4)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [acd1eap_Intercept, scleap_Intercept, acd1eap_Sigma, scleap_Sigma, acd1eap_scleap, acd1eap_c1, acd1eap_c2, acd1eap_c3, scleap_c1, scleap_c2, scleap_c3]
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 4 seconds.
100.00% [12000/12000 00:03<00:00 Sampling 4 chains, 0 divergences]
pm.model_to_graphviz(model)

az.summary(trace_med)
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
acd1eap_Intercept 0.058 0.035 -0.007 0.123 0.000 0.000 9706.0 6274.0 1.0
scleap_Intercept 0.089 0.029 0.031 0.142 0.000 0.000 9752.0 6676.0 1.0
acd1eap_scleap 0.328 0.037 0.260 0.398 0.000 0.000 12934.0 6525.0 1.0
acd1eap_c1 -0.000 0.066 -0.120 0.128 0.001 0.001 11839.0 6027.0 1.0
acd1eap_c2 -0.125 0.082 -0.282 0.029 0.001 0.001 12185.0 6373.0 1.0
acd1eap_c3 -0.149 0.086 -0.313 0.009 0.001 0.001 11086.0 5530.0 1.0
scleap_c1 -0.342 0.056 -0.444 -0.234 0.001 0.000 10387.0 6201.0 1.0
scleap_c2 -0.039 0.070 -0.165 0.096 0.001 0.001 12429.0 6258.0 1.0
scleap_c3 -0.221 0.073 -0.355 -0.080 0.001 0.000 12738.0 6714.0 1.0
acd1eap_Sigma 0.850 0.019 0.814 0.885 0.000 0.000 14507.0 5681.0 1.0
scleap_Sigma 0.722 0.016 0.691 0.752 0.000 0.000 11896.0 5736.0 1.0
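Not in the original note: a minimal sketch of how the mediation effects could be read off the posterior, assuming the usual product-of-coefficients rule for this linear model (the indirect effect of each covariate is its scl_eap slope times acd1eap_scleap, and total = direct + indirect). Variable names follow the model above.

# Derive indirect (a*b), direct, and total effects of c1, c2, c3 on acd1_eap
# from the posterior draws, then summarize them with the usual ArviZ output.
import xarray as xr

post = trace_med.posterior
effects = xr.Dataset()
for c in ['c1', 'c2', 'c3']:
    indirect = post[f'scleap_{c}'] * post['acd1eap_scleap']  # a * b
    effects[f'indirect_{c}'] = indirect
    effects[f'direct_{c}'] = post[f'acd1eap_{c}']
    effects[f'total_{c}'] = indirect + post[f'acd1eap_{c}']
az.summary(effects)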