(cache)カイカムリのスポンジ選択で学ぶ階層ベイズモデリング

このプレプリントで、カイカムリにサイズの異なる三つのスポンジを与えて、どのサイズを気に入るかテストしています。その解析部分をここで解説してみます。それぞれの選択肢を選ぶ「強さ」が身体の大きさと脚の欠損度合いの影響をどれくらい受けるのかを推定してみようと思います。また、この選択行動に個体差があると考えてそれがどれくらいか推定してみます。松浦氏のStanとRでベイズ統計モデリング、階層ベイズモデルとWAICに多くを負っています。

データ

CSV file for Harada and Kagaya, https://doi.org/10.1101/330787 · GitHub

モデル構成

Sサイズを選んだ個体はいなかったので、MとLサイズを選んだ場合と、いずれも選ばなかった場合を考えます。Mの選択を基準とした、ソフトマックス回帰を行います。また、Ｍと無選択の両方を示した個体は1個体だけだったため、個体差はＬの選択のみ考えます。甲幅Cwidth、脚の欠損度合いがLegLackです。

$μ [n, 1] = 0$
$μ [n, 2] = a_{c h o i c e_{L}} [I D [n]] + b_{c h o i c e_{L}} * C w i d t h [n] + c_{c h o i c e_{L}} * L e g L a c k [n]$
$μ [n, 3] = d_{c h o i c e_{0}} [n] + e_{c h o i c e_{0}} * C w i d t h [n] + f_{c h o i c e_{0}} * L e g L a c k [n]$
$n = 1, . . ., N_{a c t}$

$a_{c h o i c e_{L}} [k] \sim N o r m a l (a_{c h o i c e_{L 0}}, a_{c h o i c e_{L s}})$
$k = 1, . . ., N_{a n i m a l}$

$C h o i c e [n] \sim C a t e g o r i c a l (s o f t m a x (μ [n,])), n = 1, . . ., N_{a c t}$

Stanコード

      
        functions {
      
          real f(real mu, real s, real x){
      
              return(normal_lpdf(x | mu, s));
      
              }
      
          real log_lik_Simpson(real mu, real s, real a, real b, int M) {
      
            vector[M+1] lp;
      
            real h;
      
            h = (b-a)/M;
      
            lp[1] = f(mu, s, a);
      
            for (m in 1:(M/2))
      
              lp[2*m] = log(4) + f(mu, s, a + h*(2*m-1));
      
            for (m in 1:(M/2-1))
      
              lp[2*m+1] = log(2) + f(mu, s,  a + h*2*m);
      
            lp[M+1] = f(mu, s, b);
      
            return(log(h/3) + log_sum_exp(lp));
      
          }
      
          real f2(int Y, real cwl, real cwno, real ll, real lno, real C_width, real Leg_lack,
      
                  real bias_no0,
      
                  real bias_l_K){
      
            vector[3] mu;
      
            mu[1] = 0;
      
            mu[2] = bias_l_K + cwl*C_width + ll*Leg_lack;
      
            mu[3] = bias_no0 + cwno*C_width + lno*Leg_lack; 
      
            return(categorical_logit_lpmf(Y | mu));
      
          }
      
          real log_lik_Simpson_rest(int Y, real cwl, real cwno, real ll, real lno, real C_width, real Leg_lack,
      
                                    real bias_no0,
      
                                    real bias_l_lower, real bias_l_upper,
      
                                    int M){
      
            vector[M+1] lp;
      
            real h;
      
            h = (bias_l_upper - bias_l_lower)/M;
      
            lp[1] = f2(Y, cwl, cwno, ll, lno, C_width, Leg_lack,
      
                        bias_no0,
      
                        bias_l_lower);
      
            for (m in 1:(M/2))
      
              lp[2*m] = log(4) + f2(Y, cwl, cwno, ll, lno, C_width, Leg_lack,
      
                                    bias_no0,
      
                                    bias_l_lower + h*(2*m-1));
      
            for (m in 1:(M/2-1))
      
              lp[2*m+1] = log(2) + f2(Y, cwl, cwno, ll, lno, C_width, Leg_lack,
      
                                      bias_no0,
      
                                      bias_l_lower + h*2*m);
      
            lp[M+1] = f2(Y, cwl, cwno, ll, lno, C_width, Leg_lack,
      
                          bias_no0,
      
                          bias_l_upper);
      
            return(log(h/3) + log_sum_exp(lp));
      
          }
      
        }
      
        data {
      
            int N;
      
            int K;
      
            int L; 
      
            int<lower=1, upper=K> Y[N];
      
            real C_width[N];
      
            int Leg_lack[N];
      
            int<lower=1, upper=L> ID[N];
      
        }
      
        parameters {
      
            real bias_l[L];
      
            real bias_l0;
      
            real ll;
      
            real cwl;
      
            real<lower=0> s_bias_l;
      
            real cwno;
      
            real bias_no0;
      
            real lno;
      
        }
      
        transformed parameters {
      
            matrix[N,K] mu;
      
            for (n in 1:N){
      
                mu[n,1] = 0;
      
                mu[n,2] =  bias_l[ID[n]] + cwl*C_width[n] + ll*Leg_lack[n];
      
                mu[n,3] =  bias_no0 + cwno*C_width[n] + lno*Leg_lack[n]; 
      
            }
      
        }
      
        model {
      
            for (l in 1:L){
      
                bias_l[l] ~ normal(bias_l0, s_bias_l);
      
            }
      
            for (n in 1:N){
      
                Y[n] ~ categorical_logit(mu[n,]');
      
            }
      
        }
      
        generated quantities {
      
          vector[N] log_lik;
      
          real log_lik_l;
      
          // for making computing fast, separately compute loglik unrelated to Y[n] and other parameters
      
          log_lik_l = log_lik_Simpson(bias_l0, s_bias_l, 
      
                                      bias_l0 - 5*s_bias_l, bias_l0 + 5*s_bias_l, 100);
      
          for (n in 1:N){
      
            log_lik[n] = log_lik_l + 
      
                         log_lik_Simpson_rest(Y[n], 
      
                                              cwl, cwno, ll, lno, C_width[n], Leg_lack[n],
      
                                              bias_no0,
      
                                              bias_l0 - 5*s_bias_l, bias_l0 + 5*s_bias_l,
      
                                              100); 
      
          }
      
        }

view raw model_1_1_choice.stan hosted with ❤ by GitHub

gist.github.com

generated quantities ブロックで、functionsブロックで定義されている関数を実行し、モデルの予測力をWAICを用いて比較するための計算を行っています。ここでは、予測分布としては、新たな個体の新たな選択の予測を考えるのが適当ですから、個体差を表わすために導入されているローカルなパラメータ( $a_{c h o i c e_{L}}$ 、コード内では bias_l[l]) を積分しています。高速化するために、階層ベイズモデルとWAICを参考にして、分けて計算をしています。

Stanを実行するRコード

      
        library(rstan)
      
        library(tidyverse)
      
        library(shinystan)
      
        rstan_options(auto_write = TRUE)
      
        options(mc.cores = parallel::detectCores())
      
        # preprocessing                     
      
        d <- read_csv(file='Dromiidae.csv') # https://gist.github.com/kagaya/0d309397300b7e8294fe53c44b6fc525
      
        d <- d %>%
      
          select(c('shell_width', 'choice',
      
                   #'whole_px', 'cm_per_px',
      
                   'id', 'leg_lack'           ))
      
        d <- d[complete.cases(d), ]
      
        # renumber animal id from 1
      
        id <- as.factor(d$id)
      
        levels(id) <- as.character(1:length(levels(id)))
      
        d$id <- as.integer(id)
      
        d$choice <- as.factor(d$choice)
      
        levels(d$choice) <- c(2,1,3) # label M,L,No, as 1,2,3
      
        d$choice <- as.integer(as.character(d$choice))
      
        d$leg_lack <- as.factor(d$leg_lack)
      
        levels(d$leg_lack) <- c(2,3,1) # label no_leg_lack, A, C, as 1, 2, 3
      
        d$leg_lack <- as.integer(as.character(d$leg_lack))
      
        data <- list(N=nrow(d), 
      
                     K=max(d$choice),
      
                     L=max(d$id),
      
                     Y=d$choice,
      
                     #cap_ex_size=d$cap_ex_size,
      
                     C_width=d$shell_width,
      
                     Leg_lack=d$leg_lack,
      
                     ID=d$id)
      
        fit <- stan(file='model_1_1_choice.stan', ###
      
                    data=data, seed=1234,
      
                    chains=4, iter=6000, warmup=1000, thin=1)

view raw kick_model_1_1_choice.R hosted with ❤ by GitHub

gist.github.com

パラメータの事後分布

f:id:katsumushi:20180604001602p:plain

f:id:katsumushi:20180604001617p:plain

プロット

以下のプロット関数を実行すると、次の図*1が表示されます。身体が大きくなると、Lサイズを選ぶ確率が上がり、9 cmあたりを超えると、どれも選ばなくなっているのが分かります。

      
        library(tidyverse)
      
        library(ggrepel)
      
        library(rstan)
      
        expose_stan_functions("my_stan_functions.stan") # https://gist.github.com/kagaya/c726f16d82b80026bb1924b408a72b5c
      
        source("utility.R") # https://gist.github.com/kagaya/60c4190ac306840daee54115b3c315c3
      
        plot_choice_random_intercept <- function(d, fit){
      
          # mu[n,2] =  bias_l[ID[n]] + cwl*C_width[n] + ll*Leg_lack[n];
      
          # mu[n,3] =  bias_no0 + cwno*C_width[n] + lno*Leg_lack[n]; 
      
          ms <- rstan::extract(fit)
      
          new_C_width <- seq(min(d$shell_width), max(d$shell_width), length.out=500)
      
          ## 50 percentile prediction
      
          bias_l0_50 <- quantile(ms$bias_l0, 0.5)
      
          cw_l_50 <- quantile(ms$cwl, 0.5)
      
          bias_no0_50 <- quantile(ms$bias_no0, 0.5)
      
          cw_no_50 <- quantile(ms$cwno, 0.5)
      
          mu2_50 <- c() 
      
          mu3_50 <- c()
      
          theta2_50 <- c()
      
          theta3_50 <- c()
      
          choice_rng <- matrix(nrow=500, ncol=500)
      
          for (i in 1:500){
      
            mu2_50[i] <- bias_l0_50 + cw_l_50*new_C_width[i]
      
            mu3_50[i] <- bias_no0_50 + cw_no_50*new_C_width[i]
      
            theta2_50[i] <- 1 + exp(mu2_50[i])/(1 + exp(mu2_50[i]) + exp(mu3_50[i]))
      
            theta3_50[i] <- 1 + 2 * exp(mu3_50[i])/(1 + exp(mu2_50[i]) + exp(mu3_50[i])) # to overlay choice plot
      
            choice_rng[,i] <- my_categorical_logit_rng(500, c(0, mu2_50[i], mu3_50[i]))
      
          }
      
          colnames(choice_rng) <- new_C_width
      
          choice_rng <- choice_rng %>% as_tibble() %>% gather(x,y) %>% mutate(x=as.numeric(x))
      
          pred_50 <- tibble(carapace_width=new_C_width, 
      
                            theta2_50=theta2_50,
      
                            theta3_50=theta3_50)
      
          gid <- levels(d$id)[table(d$id) > 1]
      
          mul <- factor(rep("s", dim(d)[1]), levels=c("g", "s"))
      
          for (i in 1:dim(d)[1]){
      
            if (d$id[i] %in% gid){
      
              mul[i] <- c("g")
      
            }
      
          }
      
          d$mul <- mul
      
          dd <- d
      
          dd$choice <- as.integer(d$choice)
      
          d_seg <- dd %>%
      
            group_by(id) %>%
      
            summarise( choice_min=min(choice),
      
                       choice_max=max(choice),
      
                       shell_width=first(shell_width)) %>%
      
            drop_na()
      
          d <- add_chosen_col_choice(d)
      
          plt <- ggplot(d, aes(shell_width, choice)) +
      
            stat_density2d(geom="raster", aes(x=x, y=y, fill=..density..), contour=F, data=choice_rng) +
      
            geom_line(aes(carapace_width, theta2_50), alpha=0.2, data=pred_50) +
      
            geom_line(aes(carapace_width, theta3_50), alpha=0.2, data=pred_50) +
      
            geom_point(aes(size=chosen_num), pch=0, alpha=0.8) +
      
            geom_segment(
      
              data = d_seg,
      
              alpha = 0.5,
      
              lty=3,
      
              aes(x = shell_width,
      
                  y = choice_min,
      
                  xend = shell_width,
      
                  yend = choice_max)) +
      
            geom_text_repel(aes(shell_width, choice, label=id), alpha=0.2, size=3) +
      
            scale_shape_manual(values=c(1,3)) +
      
            scale_fill_gradient(low="white", high="gray") +
      
            ylim(c(0.8, 3.2)) +
      
            xlab("carapace width (cm)") +
      
            ylab("behavioral choice") +
      
            theme_classic()
      
          return(plt)
      
        }

view raw plot_choice.R hosted with ❤ by GitHub

gist.github.com

パラメータの50パーセンタイルを用いて乱数を発生させて、それを stat_density2d で重ねて、濃いところでデータ発生確率が高い様子を表現しました。my_categorical_logit_rng 関数は、次を rstan::expose_stan_functions 関数で読み込んでから実行してください。

https://gist.github.com/kagaya/c726f16d82b80026bb1924b408a72b5c

データ点が濃いところで出ているようなので、モデルの予測がうまく行っていると言ってよいかと思います。また、個体38は小さいにもかかわらず、Lサイズを好んで選ぶような個性が出ているのですが、階層モデルがこれには過剰に適合していないように見えます。

WAIC の計算

      
        waic <- function(log_likelihood) {
      
          # from https://gist.github.com/MatsuuraKentaro/3f6ae5863e700f5039c19e36a9bdf646
      
          training_error <- - mean(log(colMeans(exp(log_likelihood))))
      
          functional_variance_div_N <- mean(colMeans(log_likelihood^2) - colMeans(log_likelihood)^2)
      
          waic <- training_error + functional_variance_div_N
      
          return(waic)
      
        }

view raw waic.R hosted with ❤ by GitHub

gist.github.com

で計算できます。ここではひとつだけなので、比較はしませんが、詳しくはプレプリントを御参照ください。

*1:正確にはその原図

シャコ・エビ日記

シャコパンチ、エビパチン研究者の記録

カイカムリのスポンジ選択で学ぶ階層ベイズモデリング

データ

モデル構成

Stanコード

Stanを実行するRコード

パラメータの事後分布

プロット

WAIC の計算

	functions {
	real f(real mu, real s, real x){
	return(normal_lpdf(x \| mu, s));
	}

	real log_lik_Simpson(real mu, real s, real a, real b, int M) {
	vector[M+1] lp;
	real h;
	h = (b-a)/M;
	lp[1] = f(mu, s, a);
	for (m in 1:(M/2))
	lp[2m] = log(4) + f(mu, s, a + h(2*m-1));
	for (m in 1:(M/2-1))
	lp[2m+1] = log(2) + f(mu, s, a + h2*m);
	lp[M+1] = f(mu, s, b);
	return(log(h/3) + log_sum_exp(lp));
	}

	real f2(int Y, real cwl, real cwno, real ll, real lno, real C_width, real Leg_lack,
	real bias_no0,
	real bias_l_K){
	vector[3] mu;
	mu[1] = 0;
	mu[2] = bias_l_K + cwlC_width + llLeg_lack;
	mu[3] = bias_no0 + cwnoC_width + lnoLeg_lack;
	return(categorical_logit_lpmf(Y \| mu));
	}


	real log_lik_Simpson_rest(int Y, real cwl, real cwno, real ll, real lno, real C_width, real Leg_lack,
	real bias_no0,
	real bias_l_lower, real bias_l_upper,
	int M){
	vector[M+1] lp;
	real h;
	h = (bias_l_upper - bias_l_lower)/M;
	lp[1] = f2(Y, cwl, cwno, ll, lno, C_width, Leg_lack,
	bias_no0,
	bias_l_lower);
	for (m in 1:(M/2))
	lp[2*m] = log(4) + f2(Y, cwl, cwno, ll, lno, C_width, Leg_lack,
	bias_no0,
	bias_l_lower + h(2m-1));
	for (m in 1:(M/2-1))
	lp[2*m+1] = log(2) + f2(Y, cwl, cwno, ll, lno, C_width, Leg_lack,
	bias_no0,
	bias_l_lower + h2m);
	lp[M+1] = f2(Y, cwl, cwno, ll, lno, C_width, Leg_lack,
	bias_no0,
	bias_l_upper);
	return(log(h/3) + log_sum_exp(lp));
	}
	}

	data {
	int N;
	int K;
	int L;
	int<lower=1, upper=K> Y[N];
	real C_width[N];
	int Leg_lack[N];
	int<lower=1, upper=L> ID[N];
	}

	parameters {
	real bias_l[L];
	real bias_l0;
	real ll;
	real cwl;
	real<lower=0> s_bias_l;

	real cwno;
	real bias_no0;
	real lno;
	}

	transformed parameters {
	matrix[N,K] mu;

	for (n in 1:N){
	mu[n,1] = 0;
	mu[n,2] = bias_l[ID[n]] + cwlC_width[n] + llLeg_lack[n];
	mu[n,3] = bias_no0 + cwnoC_width[n] + lnoLeg_lack[n];
	}
	}

	model {
	for (l in 1:L){
	bias_l[l] ~ normal(bias_l0, s_bias_l);
	}
	for (n in 1:N){
	Y[n] ~ categorical_logit(mu[n,]');
	}
	}

	generated quantities {
	vector[N] log_lik;
	real log_lik_l;
	// for making computing fast, separately compute loglik unrelated to Y[n] and other parameters
	log_lik_l = log_lik_Simpson(bias_l0, s_bias_l,
	bias_l0 - 5s_bias_l, bias_l0 + 5s_bias_l, 100);
	for (n in 1:N){
	log_lik[n] = log_lik_l +
	log_lik_Simpson_rest(Y[n],
	cwl, cwno, ll, lno, C_width[n], Leg_lack[n],
	bias_no0,
	bias_l0 - 5s_bias_l, bias_l0 + 5s_bias_l,
	100);
	}
	}

	library(rstan)
	library(tidyverse)
	library(shinystan)

	rstan_options(auto_write = TRUE)
	options(mc.cores = parallel::detectCores())

	# preprocessing
	d <- read_csv(file='Dromiidae.csv') # https://gist.github.com/kagaya/0d309397300b7e8294fe53c44b6fc525
	d <- d %>%
	select(c('shell_width', 'choice',
	#'whole_px', 'cm_per_px',
	'id', 'leg_lack' ))

	d <- d[complete.cases(d), ]


	# renumber animal id from 1
	id <- as.factor(d$id)
	levels(id) <- as.character(1:length(levels(id)))
	d$id <- as.integer(id)

	d$choice <- as.factor(d$choice)
	levels(d$choice) <- c(2,1,3) # label M,L,No, as 1,2,3
	d$choice <- as.integer(as.character(d$choice))

	d$leg_lack <- as.factor(d$leg_lack)
	levels(d$leg_lack) <- c(2,3,1) # label no_leg_lack, A, C, as 1, 2, 3
	d$leg_lack <- as.integer(as.character(d$leg_lack))

	data <- list(N=nrow(d),
	K=max(d$choice),
	L=max(d$id),
	Y=d$choice,
	#cap_ex_size=d$cap_ex_size,
	C_width=d$shell_width,
	Leg_lack=d$leg_lack,
	ID=d$id)

	fit <- stan(file='model_1_1_choice.stan', ###
	data=data, seed=1234,
	chains=4, iter=6000, warmup=1000, thin=1)

	library(tidyverse)
	library(ggrepel)
	library(rstan)
	expose_stan_functions("my_stan_functions.stan") # https://gist.github.com/kagaya/c726f16d82b80026bb1924b408a72b5c
	source("utility.R") # https://gist.github.com/kagaya/60c4190ac306840daee54115b3c315c3

	plot_choice_random_intercept <- function(d, fit){
	# mu[n,2] = bias_l[ID[n]] + cwlC_width[n] + llLeg_lack[n];
	# mu[n,3] = bias_no0 + cwnoC_width[n] + lnoLeg_lack[n];
	ms <- rstan::extract(fit)
	new_C_width <- seq(min(d$shell_width), max(d$shell_width), length.out=500)

	## 50 percentile prediction
	bias_l0_50 <- quantile(ms$bias_l0, 0.5)
	cw_l_50 <- quantile(ms$cwl, 0.5)
	bias_no0_50 <- quantile(ms$bias_no0, 0.5)
	cw_no_50 <- quantile(ms$cwno, 0.5)

	mu2_50 <- c()
	mu3_50 <- c()
	theta2_50 <- c()
	theta3_50 <- c()

	choice_rng <- matrix(nrow=500, ncol=500)
	for (i in 1:500){
	mu2_50[i] <- bias_l0_50 + cw_l_50*new_C_width[i]
	mu3_50[i] <- bias_no0_50 + cw_no_50*new_C_width[i]
	theta2_50[i] <- 1 + exp(mu2_50[i])/(1 + exp(mu2_50[i]) + exp(mu3_50[i]))
	theta3_50[i] <- 1 + 2 * exp(mu3_50[i])/(1 + exp(mu2_50[i]) + exp(mu3_50[i])) # to overlay choice plot

	choice_rng[,i] <- my_categorical_logit_rng(500, c(0, mu2_50[i], mu3_50[i]))
	}

	colnames(choice_rng) <- new_C_width
	choice_rng <- choice_rng %>% as_tibble() %>% gather(x,y) %>% mutate(x=as.numeric(x))

	pred_50 <- tibble(carapace_width=new_C_width,
	theta2_50=theta2_50,
	theta3_50=theta3_50)


	gid <- levels(d$id)[table(d$id) > 1]

	mul <- factor(rep("s", dim(d)[1]), levels=c("g", "s"))
	for (i in 1:dim(d)[1]){
	if (d$id[i] %in% gid){
	mul[i] <- c("g")
	}
	}

	d$mul <- mul

	dd <- d
	dd$choice <- as.integer(d$choice)
	d_seg <- dd %>%
	group_by(id) %>%
	summarise( choice_min=min(choice),
	choice_max=max(choice),
	shell_width=first(shell_width)) %>%
	drop_na()

	d <- add_chosen_col_choice(d)

	plt <- ggplot(d, aes(shell_width, choice)) +
	stat_density2d(geom="raster", aes(x=x, y=y, fill=..density..), contour=F, data=choice_rng) +
	geom_line(aes(carapace_width, theta2_50), alpha=0.2, data=pred_50) +
	geom_line(aes(carapace_width, theta3_50), alpha=0.2, data=pred_50) +
	geom_point(aes(size=chosen_num), pch=0, alpha=0.8) +
	geom_segment(
	data = d_seg,
	alpha = 0.5,
	lty=3,
	aes(x = shell_width,
	y = choice_min,
	xend = shell_width,
	yend = choice_max)) +
	geom_text_repel(aes(shell_width, choice, label=id), alpha=0.2, size=3) +
	scale_shape_manual(values=c(1,3)) +
	scale_fill_gradient(low="white", high="gray") +
	ylim(c(0.8, 3.2)) +
	xlab("carapace width (cm)") +
	ylab("behavioral choice") +
	theme_classic()
	return(plt)
	}

	waic <- function(log_likelihood) {
	# from https://gist.github.com/MatsuuraKentaro/3f6ae5863e700f5039c19e36a9bdf646
	training_error <- - mean(log(colMeans(exp(log_likelihood))))
	functional_variance_div_N <- mean(colMeans(log_likelihood^2) - colMeans(log_likelihood)^2)
	waic <- training_error + functional_variance_div_N
	return(waic)
	}