R Code: [1.1] (note that the ANES data is pre-processed by us dropping missing observations and all that sort of thing; if you want to replicate, I’ll give you the files to do this). library(lme4) ANES76 <- read.table('R//TTS//76.dat', header = T, sep ='\t') summary(lm(con2 ~ edu1976 + gender, data = ANES76)) summary(glm(con2 ~ edu1976 + gender, family = binomial, data = ANES76)) summary(glmer(con2 ~ edu1976 + gender + (1|dis1976), family = binomial, data = ANES76)) summary(lm(con2 ~ edu1976 + gender + inc1976, data = ANES76)) summary(glmer(con2 ~ edu1976 + gender + inc1976 + (1|dis1976), family = binomial, data = ANES76)) R-code [2.1]: library(lattice) # Chap 2, to make figs 2.1, 2.2 d <- matrix(c(11, 6,10,10,59,28,10,11,7,13,12,10,5,8,7,68,22,12,6,8,10,8,6,5,8,6,66,71,7,8,7,6,10,7, 11,11,11,54,24,12,7,7,11,8,7,13,13,5,27,54,9,9,5,5,5, 8,5,10,13,54,84,8,12,7,10,6, 9,12,7,9,13,25,7,6,11,10,11, 9,10,8,11,40,18,12,6,7,7,10,12,9,12,12,26,77,9,7,10,6,10,5,12,10,8,41,36,7,5,13,13,5,10,7,9,5,35,68,10,10,11,6,5), nrow = 11, ncol = 11) colnames(d) <- seq(from = 100, to = 0, by = - 10) rownames(d) <- seq(from = 1900, to = 2000, by = 10) mylevels <- seq(7500,0,-500) hd <- dim(d)[1] vd <- dim(d)[2] fd <- cbind(d[,vd:1]) tiff('F2_1.tiff', height = 12, width = 17, units = 'cm', compression = "lzw", res = 300) levelplot(as.matrix(fd), aspect = 'fill', col.regions = gray(100:0/100), xlim = rownames(d), ylim = colnames(d)[11:1], xlab = list('year', cex = 1, font = 1), cex.axis = .75, ylab = list('age', cex = 1, font = 1), cex = 3, scales=list(draw=TRUE, x=list(cex=.5),y=list(cex=.75)), main = list('Period Effect', cex = 2), par.settings=list(fontsize=list(text=20))) dev.off() R-code [2.2]: library(lattice) d2 <- matrix(c(13,6,7,9,5,13,6,20,8,11,11,5,10,7,9,8,6,20,5,6,9,7,12,7,10,6,9,23,11,6,6,9,9,13,7,8,12,18,13,8,7,8,11,15,12,11,9,18,7,9,8,12,12,8,11,5,11,22,6,13,12,9,9,23,11,7,10,9,12,10,10,11,6,22,12,9,9,11,13,11,10,10,9,11,6,7,11,12,6,12,9,11,5,17,7,13,11,6,9,7,11,8,10,15,9,9,6,13,10,8,6,10,6,16,8,13,7,5,10,7,11), nrow = 11, ncol = 11) mylevels2 <- seq(8500,0,-500) colnames(d2) <- seq(from = 100, to = 0, by = - 10) rownames(d2) <- seq(from = 1900, to = 2000, by = 10) hd2 <- dim(d2)[1] vd2 <- dim(d2)[2] fd2 <- cbind(d2[,vd2:1]) tiff('F2_2.tiff', height = 12, width = 17, units = 'cm', compression = "lzw", res = 300) levelplot(as.matrix(fd2), aspect = 'fill', col.regions = gray(100:0/100), xlim = rownames(d2), ylim = colnames(d2)[11:1], xlab = list('year', cex = 1, font = 1), cex.axis = .75, ylab = list('age', cex = 1, font = 1), cex = 3, scales=list(draw=TRUE, x=list(cex=.5),y=list(cex=.75)), main = list('Cohort Effect', cex = 2), par.settings=list(fontsize=list(text=20))) dev.off() R-code [2.3]: install.packages('mosaic') # this is for the ntiles command library(mosaic) # this is a convenience routine to get the range for a plot with two different y variables yrng <- function(smthng, fixed, fmn = 0, fmx = 1){ numn <- (min(smthng))/2 numx <- max(smthng) if (fixed == 'T'){ yrng.mn <- min(fmn, numn) yrng.mx <- max(fmx, numx) }else{ yrng.mn <- numn yrng.mx <- numx } return(list(yrng.mn, yrng.mx)) } x <- runif(3000, -5, 5) e <- rnorm(3000, 0, 2) y <- x + e ncut <- 6 z <- ntiles(y, n = ncut, format = "rank", digits = 1) w <- ntiles(x, n = ncut, format = "rank", digits = 1) df1 <- data.frame(x = x, y = y, z = as.numeric(z), w = as.numeric(w)) bz <- rep(0, ncut) bw <- rep(0, ncut) for (i in 1:ncut){ dfz <- df1[df1$z == i,] dfw <- df1[df1$w == i,] m1 <- lm(dfz$y ~ dfz$x) bz[i] <- m1$coefficients[2] m1 <- 
lm(dfw$z ~ dfw$x) bw[i] <- m1$coefficients[2] } yl <- unlist(yrng(c(bw, bz), fixed = F, fmn = 0, fmx = 1)) plot(x = 1:ncut, y = bw, type = 'b', ylim = yl, ylab = 'slope', xlab = 'bin') lines(x = 1:ncut, y = bz, type = 'b', lty = 2, pch = 2) legend('center', legend = c('independent', 'dependent'), lty=c(1,2), pch = c(1,2), bty='n', cex=.75) [2.4] R-code: N <- 1000 x <- rnorm(N,8, 2) x[x<1] <- 1 x[x>10] <- 10 x2 <- x*x y <- (exp(x/1.4)+400)/20 + rnorm(N,0,5) m1 <- lm(y ~ x + x2) x_f <- seq(1, 10, .1) pr1 <- x_f*m1$coefficients[2] + x_f*x_f*m1$coefficients[3] + m1$coefficients[1] # this shows how the quadratic is misleading; not printed in book plot(x = x, y = y) lines(x = x_f, y = pr1, lty= 2) # here's the scatter with marginal install.packages('psych') library(psych) scatter.hist(x = x,y=y,smooth=TRUE,ab=FALSE,correl=TRUE,density=TRUE,ellipse=F, digits=2, cex.cor=1,title="Scatter plot + histograms",xlab='x', ylab='y') [3.1] R-code: ## Note: this replaces the code that originally corresponded to note 6 in Chapter 3. ## ## Explanation: in the interests of condensing an explanation of the logic that would fit into the note, I had ## made a version of this simulation that _basically_ made the data as I was describing it, but took shortcuts. ## Yong Gao realized that the way I had set it up was unstable -- while most of the time it behaved as I described, ## in other cases, it did the exact opposite! This version takes more time to describe, but it fits the logic that ## was given in the note. ## ## There are some deliberate redundancies here (sometimes using the odds ratio and sometimes the coefficient from ## a logistic regression) in case you are interested in changing the model N <- 1000 risk_prop_err <- 5 # distribution on risk taking dev_prop_err <- 3 # unexplained variance in being a devil given risk taking preg_prop_mean <- -3 # mean probability of becoming pregnant, all else equal preg_prop_err <- 1 # unexplained variance in becoming pregnant, all else equal Preg_rep_err <- .01 # reporting error for becoming pregnant drop_prop_mean <- -1 # mean probability of dropping out, all else equal drop_prop_err <- 2 # unexplained variance in dropping out, all else equal DO_rep_err <- .01 # reporting error for dropping out DO_slope <- 3 # strength of relation bteween becoming pregnant and dropping out Nsim <- 100 # number of simulations to do # convenience function odds_ratio <- function(table1){ num1 <- table1[1,1]*table1[2,2] den1 <- table1[1,2]*table1[2,1] if(den1 == 0){ print('error in odds ratio') stop() } return(num1/den1) } # given the parameters, this makes the difference in odds ratio coefficients for the observed data; if verbose is true, # it gives lots of info, otherwise, for embedded-in-loop uses, it just returns the difference main_routine <- function(N, risk_prop_err, dev_prop_err, preg_prop_mean, preg_prop_err, Preg_rep_err, drop_prop_mean, drop_prop_err, DO_rep_err, DO_slope, Verbose = T){ Risk_taking <- rnorm(N,0,risk_prop_err) Is_Devil_propensity <- Risk_taking + rnorm(N,0,dev_prop_err) Is_Devil <- as.integer(Is_Devil_propensity>0) Pregnancy_propensity <- Risk_taking + rnorm(N,preg_prop_mean,preg_prop_err) Pregnant <- Pregnancy_propensity>0 Measured_preg <- Pregnant # to make our error filled measure, we first assign all true values Preg_switchers <- which(as.logical(rbinom(N,1,Preg_rep_err))) # and then randomly reverse some Measured_preg[Preg_switchers] <- 1 - Measured_preg[Preg_switchers] Drop_out_propensity <- DO_slope*Pregnant + rnorm(N,drop_prop_mean,drop_prop_err) 
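# Next the drop-out propensity is dichotomized and a small share of the measured
# drop-out indicator is flipped to mimic reporting error; separate logits of measured
# drop-out on measured pregnancy are then fit for the 'angels' (Is_Devil == 0) and the
# 'devils' (Is_Devil == 1), and (when Verbose = F) the function returns the difference
# between those two coefficients.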
Drop_out <- Drop_out_propensity>0 Measured_Drop_out <- Drop_out # reverse some proportion due to error DO_switchers <- which(as.logical(rbinom(N,1,DO_rep_err))) Measured_Drop_out[DO_switchers] <- 1 - Measured_Drop_out[DO_switchers] angel_mod <- summary(glm(Measured_Drop_out[Is_Devil == 0] ~ Measured_preg[Is_Devil == 0], binomial)) devil_mod <- summary(glm(Measured_Drop_out[Is_Devil == 1] ~ Measured_preg[Is_Devil == 1], binomial)) angel_coef <- angel_mod$coefficients[2,1] devil_coef <- devil_mod$coefficients[2,1] if(Verbose){ summary(lm(Is_Devil_propensity ~ Risk_taking)) summary(lm(Pregnancy_propensity ~ Risk_taking)) summary(lm(Drop_out_propensity ~ Pregnant)) summary(glm(Drop_out ~ Pregnant, binomial)) summary(glm(Drop_out ~ Is_Devil, binomial)) summary(glm(Measured_Drop_out ~ Measured_preg, binomial)) summary(glm(Measured_Drop_out ~ Is_Devil, binomial)) print(paste0('difference of devil (',format(devil_coef, digits = 3),') to angel (', format(angel_coef, digits = 3),') coefficients: ',format(devil_coef-angel_coef, digits = 3))) print(table(Pregnant, Is_Devil)) print(table(Drop_out, Is_Devil)) print(table(Drop_out, Pregnant)) print(table(Drop_out, Pregnant, Is_Devil)) print(table(Measured_preg, Is_Devil)) print(table(Measured_Drop_out, Is_Devil)) print(table(Measured_Drop_out, Measured_preg)) print(table(Measured_Drop_out, Measured_preg, Is_Devil)) OR_angel <- odds_ratio(table(Measured_Drop_out[Is_Devil == 0], Measured_preg[Is_Devil == 0])) OR_devil <- odds_ratio(table(Measured_Drop_out[Is_Devil == 1], Measured_preg[Is_Devil == 1])) print(paste0('odds_ratios between observed dropping out and pregancy are ',format(OR_angel, digits = 3),' for angels and ', format(OR_devil, digits = 3),' for devils.')) }else{ return(devil_coef-angel_coef) } } outer_loop <- function(N, risk_prop_err, dev_prop_err, preg_prop_mean, preg_prop_err,Preg_rep_err, drop_prop_mean, drop_prop_err, DO_rep_err, DO_slope, Nsim){ res_list <- rep(NA, Nsim) for(i in 1:Nsim){ res_list[i] <- main_routine(N, risk_prop_err, dev_prop_err, preg_prop_mean, preg_prop_err, Preg_rep_err, drop_prop_mean, drop_prop_err, DO_rep_err, DO_slope, Verbose = F) } plot(density(res_list[!is.na(res_list)]), main = 'distribution of difference (devils - angels) in odds-ratios, logged metric') } # here is an example main_routine(N, risk_prop_err, dev_prop_err, preg_prop_mean, preg_prop_err, Preg_rep_err, drop_prop_mean, drop_prop_err, DO_rep_err, DO_slope, Verbose = T) # let's look at a distribution of Nsim trials; the return is the excess association among the devils outer_loop(N, risk_prop_err, dev_prop_err, preg_prop_mean, preg_prop_err, Preg_rep_err, drop_prop_mean, drop_prop_err, DO_rep_err, DO_slope, Nsim) R-code: [4.1] z <- runif(1000,0,1) e1 <- rnorm(1000,0,.2) e2 <- rnorm(1000,0,.2) e3 <- rnorm(1000,0,.284) x <- .5*z + e1 y <- .5*x + .5*z + e2 summary(lm(y ~ x + z)) summary(lm(y ~ x)) nslice <- 100 Bs <- vector(length = nslice+1) for (i in 0:nslice){ pe <- i/nslice zo <- pe*e3 + (1-pe)*z Bs[i] <- summary(lm(y ~ x + zo))$coefficients[2] } xx = 0:nslice plot(x = xx, xlim = c(0,nslice), ylim = c(.5,1), main = 'imperfect observation', xlab = 'degree of error', y = Bs, ylab ='B(x)', pch = 18, cex = 2) [4.2] z <- runif(1000,0,1) e1 <- rnorm(1000,0,.2) e2 <- rnorm(1000,0,.2) x <- 1*z + e1 y <- .5*x + 1*z + e2 summary(lm(y ~ x + z)) Bs <- vector(length = 10) j <- 0 for (i in 11:2){ z1 <- cut(z,i, include.lowest = TRUE, labels = 1:i) z2 <- as.numeric(as.character(z1)) j <- j+1 Bs[j] <- summary(lm(y ~ x + z2))$coefficients[2] } plot(x = 11:2, xlim = 
c(11,2), main = 'granulated', xlab = 'number of bins', y = Bs, ylab ='b(x)', pch = 18, cex = 2) xx = 2:11 loess_fit <- loess(Bs ~ xx, span = .8) lines(predict(loess_fit)[11:1], lwd = 3) [4.3] library(lattice) z <- runif(1000,0,1) e1 <- rnorm(1000,0,.2) e2 <- rnorm(1000,0,.2) e3 <- rnorm(1000,0,.284) x <- z + e1 y <- .5*x + z + e2 summary(lm(y ~ x + z)) nslice <- 100 nbin <-11 Bs <- matrix(data = NA, nrow = nslice, ncol = nbin) j <- 0 for (i in nbin:2){ for (k in 1:nslice){ pe <- k/nslice zo <- pe*e3 + (1-pe)*z z1 <- cut(zo,i, include.lowest = TRUE, labels = 1:i) zoc <- as.numeric(as.character(z1)) Bs[k,i] <- summary(lm(y ~ x + zoc))$coefficients[2] } j <- j+1 } levelplot(as.matrix(Bs), aspect = 'fill', col.regions = gray(0:100/100), xlim = c(1,nslice), ylim = c(nbin, 2), xlab = list('degree of error', cex = 1, font = 1), ylab = list('number of categories', cex = 1, font = 1), cex = 1.5, scales=list(draw=FALSE), main = list('collapsing and error', cex = 2), par.settings=list(fontsize=list(text=20))) [4.4] R-Code: N <- 1500 x1 <- runif(N, 0, 100) x2 <- runif(N, 0, 100) x1_bar <- mean(x1) x2_bar <- mean(x2) x1n <- x1 - x1_bar x2n <- x2 - x2_bar e1 <- 75 e <- rnorm(N, 0, e1) y1 <- .5*x1 + .5*x2 + .05*x1*x2 - 200 y2 <- .5*x1n + .5*x2n + .05*x1n*x2n plot(x = y1, y = y2) hist(y1) min(y1) mean(y1) [4.5] R-Code: # x1 is equivalent to genes; # x2 something correlated with genes, such as biodescent; # x3 is something that, in correlation with x2, predicts y (e.g., environmment) make_data2 <- function (N, xmax, e12, b12, b1, b2, b3, bint, ey, nbin){ x1 <- runif(N, -xmax, xmax) ee1 <- rnorm(N, 0, e12) x2 <- b12*x1 + ee1 # to make the two correlated x3 <- runif(N, -xmax, xmax) ee2 <- rnorm(N, 0, ey) y <- b1*x1 + b2*x2 + b3*x3 + bint*x2*x3 + ee2 z <- as.numeric(y > 0) bins <- cut(x1, nbin, include.lowest = TRUE, labels = 1:nbin) bin_ref <- as.numeric(as.character(bins)) df2 <- data.frame(x1 = x1, x2 = x2, x3 = x3, bin_ref = bin_ref, y = y, z = z) return(df2) } N <- 300 xmax <- 1 e12 <- 1 # controls correlation between x1 and x2 b12 <- 1 # b1 <- 0 # no effect of genes b2 <- 1 # b3 <- 1 # . bint <- 2 # this is the x2-x3 effect ey <- 1 nbin <- 3 # number of bins of predictors df2 <- make_data2(N, xmax, e12, b12, b1, b2, b3, bint, ey, nbin) print('first, true model') summary(lm(y ~ x1 + x2 + x3 + x2:x3+ x1:x3, data = df2)) print('second, model with wrong interaction') summary(lm(y ~ x1 + x2 + x3 + x1:x3, data = df2)) print('breaking into bins by x1, even controlling for x2, gives wrong results') for (i in 1:nbin){ dfi <- subset(df2, df2$bin_ref==i) print(summary(lm(y ~ x2 + x3, data = dfi))) } [5.1] R-code: # first example (shown) NS <- 50 # number schools N <- 500 # number individuals Sc <- rep(1:NS, each = N/NS) Scf <- as.factor(Sc) # school each individual is in Cl <- runif(NS, 0, 10) # school values seq1 <- seq(0, 30) Vlen <- length(seq1) Res1 <- rep(0, Vlen) ptr <- 0 for (i in seq1){ ptr <- ptr + 1 Rep <- Cl[Sc] + rnorm(N, 0, sqrt(i)) # the individual responses A1 <- unlist(summary(aov(Rep~Scf))) # the anove Res1[ptr] <- A1[3]/(A1[3] + A1[4]) } plot(x = seq1, y = Res1, xlab = 'Error Variance', ylab = 'Proportion Between Groups') i <- 2 # example used in text Rep <- Cl[Sc] + rnorm(N, 0, sqrt(i)) A1 <- unlist(summary(aov(Rep~Scf))) A1[3]/(A1[3] + A1[4]) boxplot(Rep ~ Sc, xlab = 'Group', xaxt = 'n') # this shows relation of proportion explained to number of individuals in # the higher level units. 
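# Holding the individual-level error variance at i = 2, the loop below varies the
# number of students per school (j) and stores the between-group share of the total
# sum of squares from the one-way ANOVA in Res2.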
Cl <- runif(NS, 0, 10) seq2 <- seq(1, 50) Vlen2 <- length(seq2) Res2 <- rep(0, Vlen2) ptr <- 0 for (j in seq2){ ptr <- ptr + 1 N2 <- NS*j Sc <- rep(1:NS, each = j) Scf <- as.factor(Sc) Rep <- Cl[Sc] + rnorm(N2, 0, sqrt(i)) A2 <- unlist(summary(aov(Rep~Scf))) Res2[ptr] <- A2[3]/(A2[3] + A2[4]) } plot(x = seq2, y = Res2, xlab = 'Number students per School', ylab = 'Proportion Between Groups') [5.2] R code: # first showing effect of variance reduction on R-sq N <- 1500 evar <- 100 b <- 1 x <- runif(N, 1, 100) e <- rnorm(N,0, evar) y <- b*x + e A1 <- data.frame(y = y, x = x) line1 <- lm(y ~ x) r2 <- summary(line1)$r.squared plot(x = x, y = y, xlab = 'X', ylab = 'Y', main = 'raw data') abline(line1) legend(x = 'topleft', legend = paste0('R-sq: ', format(r2, digits=2))) crunch <- function (A, j) { c <- mean(A$y[A$z == j]) return(c) } j <- 0 top <- 100 bot <- 5 Rs <- vector(mode = 'integer', length = top - bot + 1) for (i in top:bot){ A1$z <- as.numeric(as.character(cut(x, i, include.lowest = TRUE, labels = 1:i))) y1 <- sapply(1:i, crunch, A = A1) x1 <- 1:i j <- j+1 Rs[j] <- summary(lm(y1 ~ x1))$r.squared if(i %% 20 == 0){ plot(x = x1, y = y1, xlab = 'X', ylab = 'Y', main = paste0(i,' bins')) line1 <- lm(y1 ~ x1) r2 <- summary(line1)$r.squared legend(x = 'topleft', legend = paste0('R-sq: ', format(r2, digits=2))) } } plot(x = top:bot, y = Rs, xlim = c(top,bot), xlab = 'number of bins', main = 'Fit and Binning' ) # now adding w to things ec <- 50 w <- runif(N, 1, 100) e1 <- rnorm(N,0, ec) e2 <- rnorm(N,0, ec) x <- w + e1 y <- w + e2 + x # note change above to y <- w + e2 - x for final plot A2 <- data.frame(y = y, x = x, w = w) line1 <- lm(A2$y ~ A2$x) r2 <- summary(line1)$r.squared legend(x = 'topleft', legend = paste0('R-sq: ', format(r2, digits=2))) plot(x = A2$x, y = A2$y, xlab = 'X', ylab = 'Y', main = ' original ') summary(lm(A2$y ~ A2$x)) [5.3] R-code: # here I assume the two independent variables correspond to Period and Cohort b1 <- 1:100 # breaks to be used in turning continuous into table Cbs <- runif(100,-2,2) # these are the true values of cohort /period effects Pbs <- runif(100,-2,2) # across all values of both startv <- 10 stopv <- 90 lenv <- stopv - startv + 1 res1 <- rep(0,lenv) for (i in startv:stopv){ N <- 1000 Cohort <- runif(N, i, 100) Period <- runif(N, 100-i, 100) C1 <- cut(Cohort, breaks = b1, include.lowest = T) # turn to factors P1 <- cut(Period, breaks = b1, include.lowest = T) # like APC table C2 <- as.numeric(C1) # need this version for computation below P2 <- as.numeric(P1) e1 <- rnorm(N, 0, 50) Y1 <- rep(N,0) # Awkward but I am out of internet range with no manuals for (j in 1:N){ Y1[j] = Cbs[C2[j]] + Pbs[P2[j]] + e1[j] } L1 <- lm(Y1 ~ C1 + P1) A1 <- anova(L1) CohortSS <- A1$`Sum Sq`[1] PeriodSS <- A1$`Sum Sq`[2] res1[i - startv + 1] <- CohortSS/(PeriodSS+CohortSS) } plot(x = startv:stopv, y = res1, type = 's', xlab = 'skew', ylab = 'proportion explained') [5.4] R-code: # this makes a data frame with a dichotomous dependent variable given our two predictors' betas and the residual error make_data <- function (N, xmax, e1, b1, b2, nbin){ x1 <- runif(N, -xmax, xmax) x2 <- runif(N, -xmax, xmax) ee <- rnorm(N, 0, e1) y <- b1*x1 + b2*x2 + ee z <- as.numeric(y > 0) y_hat1 <- b1*x1 # predictions based on first variable. 
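# These predictions from the strong predictor alone are used to bin the cases below;
# make_p_vals() then fits a logit of z on the weak predictor x2 within each bin and
# records the resulting p-value.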
bins <- cut(y_hat1, nbin, include.lowest = TRUE, labels = 1:nbin) bin_ref <- as.numeric(as.character(bins)) df1 <- data.frame(x1 = x1, x2 = x2, bin_ref = bin_ref, y = y, z = z) return(df1) } # this regresses the dichotomy on b1 within a certain number of bins # and gives the p-value within each bin make_p_vals <- function (df1, nbin){ b_x2 <- rep(NA, nbin) p_x2 <- rep(NA, nbin) for (i in 1:nbin){ df2 <- subset(df1, bin_ref == i) fit <- glm(z ~ x2, data = df2, family = "binomial") b_x2[i] <- summary(fit)$coefficients[2] # in case you want these... p_x2[i] <- coef(summary(fit))[2,4] } return(p_x2) } # this plots the data as follows: we have a scatterplot of the strong # (horizontal) and weak (vertical) IVs; # the points are colored by their value on the dichotomy, # red is 0, green is 1. Then, the p-value of the regression # in each bin is placed on the 0 - 1 scale as a black rectangle. pretty_plot <- function (df1, p_x2){ nbin <- length(p_x2) N <- nrow(df1) plot(y = p_x2, x = 1:nbin) cc <- vector(length = N) for (i in 1:N){ if (df1$z[i]==0) { cc[i] = 'red' }else{ cc[i] = 'green' } } par(mar=c(5, 4, 4, 6) + 0.1) plot(x = df1$x1, y= df1$x2, pch = 19,col = cc, axes = FALSE, ylim = c(-1,1), main = 'example results') axis(side=2) axis(side=1) par(new = TRUE) plot(x = 1:nbin, y = p_x2, pch = 15, bg = 'black', cex = 2, xlim = c(1,nbin), ylim= c(0, 1), axes=FALSE, ylab = '', xlab = '') axis(side=4, ylim= c(0, 1)) mtext("p-value",side=4,col="red",line=4) } p_tendency <- function(N, xmax, e1, b1, b2, nbin, nsim){ p_results <- matrix (data = 0, nrow = nsim, ncol = nbin) p_mean <- rep(0, nbin) for (i in 1: nsim){ df1 <- make_data(N, xmax, e1, b1, b2, nbin) p_results[i,] <- make_p_vals(df1, nbin) } p_mean <- colMeans(p_results) plot(y = p_mean, x = 1:nbin, main = 'average p-values', ylab = 'p-val', xlab = 'bin', type = 'b', pch = 16) return (p_results) } # this is what made the figures you see. N <- 300 xmax <- 1 e1 <- 3 # b1 <- 7 # slopes for the true relations. This is the strong one. b2 <- 1 # this is the weak one. 
# Note that these are relative terms
nbin <- 10 # number of bins of predictors
nsim <- 100 # number of simulations to do
p_tendency(N, xmax, e1, b1, b2, nbin, nsim)
df1 <- make_data(N, xmax, e1, b1, b2, nbin)
p_results <- make_p_vals(df1, nbin)
pretty_plot(df1, p_results)
[5.5] R code:
n <- 300
x <- -(n/2):(n/2)
e <- rnorm((n+1),0,50)
y <- x + e
plot(x = x, y = y)
id <- x + y
term <- rep.int(2, n+1)
e1 <- rnorm((n+1),0,10)
id2 <- id + e1
ct1 <- 50
cut1 <- rep.int(-ct1, n+1)
cut2 <- rep.int(ct1, n+1)
term <- -1*(as.integer(id2 < cut1)) + as.integer(id2 > cut2) + 2
plot(x=x, y=y, pch = term, xlab = 'moral conservatism', ylab = 'economic conservatism')
[5.6] R-code:
library(lattice)
library(gridExtra)
G <- 4 # groups
Ng <- 100
N <- Ng*G
e1 <- rnorm(N, 0, .65)
Xmean <- trunc(((1:N)-1)/Ng)+1
Ymean <- Xmean
Gp = as.factor(Xmean)
Xper <- rnorm(N, 0, .5)
Xx <- Xmean + Xper
Yy <- Ymean - Xper + e1
summary(lm(Yy ~ Xx))
myTheme <- standard.theme(col = FALSE)
myTheme$superpose.symbol$pch <- 1:7
p2 <- xyplot(Yy ~ Xx, group= Gp, par.settings = myTheme, cex = 1.2, xlab = list('X', cex = 1.5), ylab = list('Y', cex = 1.5), scales = list(cex = 1.5, font = 3), main = list("Simpsons", cex = 2))
grid.arrange(p2, ncol=1)
p2 <- xyplot(Yy ~ Xx, group= Gp, par.settings = myTheme, cex = 1.2, xlab = list('X', cex = 1.5), ylab = list('Y', cex = 1.5), scales = list(cex = 1.5, font = 3), main = list("Simpsons", cex = 2), type = c("p", "r"), col.line = 'black', lwd = 3)
grid.arrange(p2, ncol=1)
[5.7] R-code:
x1 <- runif(100, 5, 10)
x2 <- runif(100, 15, 20)
e1 <- rnorm(100, 0, .5)
e2 <- rnorm(100, 0, 2.4)
y1 <- x1 + 10 + e1
y2 <- x2 - 10 + e2
plot(x = x1, y = y1, xlab = 'x', ylab = 'y', xlim = c(0, 25), ylim = c(0, 25), pch = 16)
abline(lm(y1 ~ x1), lwd = 2)
par(new = T)
plot(x = x2, y = y2, xlab = '', ylab = '', xlim = c(0, 25), ylim = c(0, 25), pch = 2)
abline(lm(y2 ~ x2), lty = 3, lwd = 2)
[5.8] R code:
library(lme4)
C <- 50
Nc <- vector(length = C) # will hold number of individuals in each area
Nc <- trunc(runif(n = C, 50, 100))
N <- sum(Nc)
Xc <- runif(n = C, 0, 100) # X value in each area
ec <- 30 # error std dev
e1 <- rnorm(n = C, 0, ec)
Yc <- Xc + e1 # mean Y value in each area
print('first ecological data')
summary(lm(Yc ~ Xc))
f1 <- function(Xc, Nc, i) rep(x = Xc[i], times = Nc[i])
xx <- as.vector(unlist(lapply(1:C, f1, Xc = Xc, Nc = Nc))) # shared X
ex <- rnorm(N, 0, 10) # reporting error on x
xe <- xx + ex # individual observation
ee <- as.vector(unlist(sapply(1:C, f1, Xc = e1, Nc = Nc))) # shared Y - X
cc <- as.vector(unlist(sapply(1:C, f1, Xc = 1:C, Nc = Nc))) # fixed effect
ec2 <- 70 # individual error
e3 <- rnorm(n = N, 0, ec2)
yy <- xx + ee + e3 # individual level y
A3 <- data.frame(x = xe, y = yy, c = as.factor(cc))
print('naive individual data')
summary(lm(A3$y ~ A3$x))
print('random effects individual')
summary(glmer(A3$y ~ A3$x + (1|A3$c), family = gaussian))
summary(lmer(A3$y ~ A3$x + (1|A3$c)))
print('fixed effects via dummies')
print(summary(lm(y ~ x + c - 1, data = A3, na.action = na.omit)))
A4 <- aggregate(A3, by = list(A3$c), FUN = mean) # now collapse over C
A3$y2 <- A3$y - A4$y[A3$c]
A3$x2 <- A3$x - A4$x[A3$c]
print('fixed effects via centering')
print(summary(lm(y ~ x2, data = A3, na.action = na.omit)))
plot(x = xx, y = yy, xlab = 'true X values', ylab = 'observed Y values')
points(x = Xc, y = Yc, cex = 2.5, pch = 16)
[5.9] R Code:
# this function is used to make a set of observations from M predictors, with
# log odds ratios stored in LOR, and a set of frequencies,
# and a base log odds ratio
# BO for the dependent variable
make_obs <- function(M, LOR, freq, BO){
  X <- expand.grid(data.frame(matrix(rep(0:1, M), nrow = 2)))
  X[X == 0] <- -1
  Z <- apply(X, 1, function(x) x*LOR/2) # multiply log odds ratios by 1 or -1
  LO1 <- colSums(Z) # add these up and exponentiate for
  # overall odds ratio
  OR1 <- exp(LO1)*BO # and times the base odds ratio
  Prs <- OR1/(1+OR1) # turn to probabilities positive
  Mprs <- 1 - Prs # and negative
  PM <- cbind(Prs, Mprs) # put together, transpose and make into
  # a column for dependent variable
  TPM <- t(PM)
  dim(TPM) <- c(2^(M+1),1) # reshape
  X <- expand.grid(data.frame(matrix(rep(0:1, M+1), nrow = 2)))
  obs <- round(freq*TPM)
  X <- cbind(X,obs)
  return(X)
}
# first let's compare the parameters from a full model and one that omits
# a predictor
M <- 5 # number of independent variables
LOR <- (runif(M,.5,2))*(sign(runif(M,-1,1))) # generate an odds ratio for each effect
freq <- rep(100, 2^(M+1)) # constant frequency per category
BO <- .8 # base odds
X <- make_obs(M, LOR, freq, BO) # now make our data and fit two models
model1 <- X$X1 ~ X$X2 + X$X3 + X$X4 + X$X5 + X$X6
model2 <- X$X1 ~ X$X2 + X$X3 + X$X4 + X$X5
G1 <- summary(glm(model1, weights = X$obs, family = "binomial"))
G2 <- summary(glm(model2, weights = X$obs, family = "binomial"))
G1NLOR <- -1*G1$coefficients[2:M,1] # flipping so they go the same way
G2NLOR <- -1*G2$coefficients[2:M,1] # flipping so they go the same way
X2 <- aggregate(X, by = list(X$X1,X$X2, X$X3,X$X4,X$X5), FUN = sum) # now collapse over last variable
freq2 <- rep(200, 2^M) # constant freq/category is twice as much
X3 <- make_obs(M-1, G1NLOR, freq2, BO) # make from full model
X4 <- make_obs(M-1, G2NLOR, freq2, BO) # and from partial
print('Full model:')
chisq.test(X2$obs, X3$obs)
print('Partial model:')
chisq.test(X2$obs, X4$obs)
NLOR <- -1*LOR # flipping so they go the same way and
# preparing to print a table of results
tem <- append(G2$coefficients[2:M,1], NA)
res <- cbind(NLOR,G1$coefficients[1:M+1,1],tem)
colnames(res) <- cbind('Actual', 'Full', 'Partial')
R1 <- lm(G1$coefficients[1:M+1,1] ~ NLOR) # and now plot both sets of
# estimates against originals with lines
plot(x = - LOR, y = G1$coefficients[1:M+1,1], xlab = 'actual', ylab = 'estimates', main = 'full model')
abline(R1)
R2 <- lm(G2$coefficients[2:M,1] ~ NLOR[1:M-1])
plot(x = - LOR[1:M-1], y = G2$coefficients[2:M,1], xlab = 'actual', ylab = 'estimates', main = 'partial model')
abline(R2)
# okay, now let's prove that the comparison between two logits is the same
# as that reached in a conventional loglinear model
# first make another draw
LORY <- (runif(M,.5,2))*(sign(runif(M,-1,1))) # generate an odds ratio for each effect
freqY <- rep(100, 2^(M+1)) # constant frequency per category
BO <- .8 # base odds
Y <- make_obs(M, LORY, freqY, BO) # now make our data and fit two models
# make two subpopulations
X6A <- X[which(X$X6==0),]
X6B <- Y[which(Y$X6==1),]
XY <- rbind(X6A,X6B)
model6A <- X6A$X1 ~ X6A$X2 + X6A$X3 + X6A$X4 + X6A$X5
GA1 <- summary(glm(model6A, weights = X6A$obs, family = "binomial"))
model6B <- X6B$X1 ~ X6B$X2 + X6B$X3 + X6B$X4 + X6B$X5
GB1 <- summary(glm(model6B, weights = X6B$obs, family = "binomial"))
GA1$coefficients[2]
GB1$coefficients[2]
# let's demonstrate that in fact the loglinear model gets the same coefficients as the logit
model6AL <- X6A$obs ~ X6A$X1 + X6A$X2 + X6A$X3 + X6A$X4 + X6A$X5 + X6A$X1*X6A$X2 + X6A$X1*X6A$X3 + X6A$X1*X6A$X4 + X6A$X1*X6A$X5 + X6A$X2*X6A$X3 + X6A$X2*X6A$X4 + X6A$X2*X6A$X5 + X6A$X3*X6A$X4 + X6A$X3*X6A$X5 + X6A$X4*X6A$X5 +
X6A$X2*X6A$X3*X6A$X4 + X6A$X2*X6A$X3*X6A$X5 + X6A$X2*X6A$X4*X6A$X5 + X6A$X3*X6A$X4*X6A$X5 + X6A$X2*X6A$X3*X6A$X4*X6A$X5 #model6AB <- loglm(model6AL, X6A) model6AG <- glm(model6AL, family = poisson, data = X6A) summary(model6AG) # look at the interaction between 1 and 2 (X6A$X1:X6A$X2); that's the effect of 2 on 1 # okay, this is going to get hairy, because we also need now to interact everything with X6 for the combined model model6ABL <- XY$obs ~ XY$X1 + XY$X2 + XY$X3 + XY$X4 + XY$X5 + XY$X6 + XY$X1*XY$X2 + XY$X1*XY$X3 + XY$X1*XY$X4 + XY$X1*XY$X5 + XY$X2*XY$X3 + XY$X2*XY$X4 + XY$X2*XY$X5 + XY$X3*XY$X4 + XY$X3*XY$X5 + XY$X4*XY$X5 + XY$X2*XY$X3*XY$X4 + XY$X2*XY$X3*XY$X5 + XY$X2*XY$X4*XY$X5 + XY$X3*XY$X4*XY$X5 + XY$X2*XY$X3*XY$X4*XY$X5 + XY$X1*XY$X6 + XY$X2*XY$X6 + XY$X3*XY$X6 + XY$X4*XY$X6 + XY$X5*XY$X6 + XY$X1*XY$X2*XY$X6 + XY$X1*XY$X3*XY$X6 + XY$X1*XY$X4*XY$X6 + XY$X1*XY$X5*XY$X6 + XY$X2*XY$X3*XY$X6 + XY$X2*XY$X4*XY$X6 + XY$X2*XY$X5*XY$X6 + XY$X3*XY$X4*XY$X6 + XY$X3*XY$X5*XY$X6 + XY$X4*XY$X5*XY$X6 + XY$X2*XY$X3*XY$X4*XY$X6 + XY$X2*XY$X3*XY$X5*XY$X6 + XY$X2*XY$X4*XY$X5*XY$X6 + XY$X3*XY$X4*XY$X5*XY$X6 + XY$X2*XY$X3*XY$X4*XY$X5*XY$X6 model6ABG <- glm(model6ABL, family = poisson, data = XY) summary(model6ABG) # look at the interaction between 1 and 2; that's the effect of 2 on 1 # notice XY$X1:XY$X2 is still what it was # now look at XY$X1:XY$X2:XY$X6 -1.833067 summary(model6ABG)$coefficients[27] # group A effect from logline # and compare to GB1$coefficients[2] - GA1$coefficients[2] [6.1] R-code: nc <- 50 # number of cities xx <- 1:nc nw <- trunc((1+exp(xx/10))*250) # number of women in each city urban <- (nw>1500) pb <- .1 # probability that any woman has a birth nb <- rbinom(nw, nw, pb) # number of babies plot (x = nw, y = nb, xlab = 'number of women', ylab = 'number of babies', cex.lab = 1.5, pch = 16, cex =2) rate <- nb/nw summary(lm(nb ~ urban)) summary(lm(nb ~ urban + nw)) summary(lm(rate ~ urban)) [6.2] R-code: pop <- 2000*exp((1:50)/10) pop <- seq(from = 50000, to = 1000000, length.out = 50) zoos <- trunc(sqrt(pop)/100) zpc <- zoos/pop crime <- pop/100000 summary(lm(crime ~ zpc)) summary(lm(crime ~ zoos)) xx <- (50:550)/50 plot(y = xx, x = xx, lty = 1, type = 'l', xlab = 'population', ylab = 'whatever', ylim=c(0,10), lwd =2) lines(y = sqrt(xx), x = xx, lty = 2, type = 'l', cex =2, lwd =2) # lines(y = log(xx), x = xx, lty = 2, type = 'l') lines(y = rep(1,501), x = xx, lty = 3, type = 'l', lwd = 2) legend('topleft', c('linear','square root','constant'), lty=1:3, bty='n', cex=1.5, lwd = 2) [6.3] R-code: M <- 4 # number of variables NC <- 500 # number of cities ev <- 10 # error variance tunable parameter df <- matrix(nrow = NC, ncol = (2*M+1)) # this is going to be the data frame we use p1 <- runif(NC, 1000, 1000000) # we randomly choose population of each city p2 <- p1/sum(p1) df[,(M+1)] <- p1 # this will hold the actual population #df[,(M+1)] <- trunc(exp(p1)) # this will hold the actual population # now for each variable, we make a particular exponential # transform of population (pexp) # make an error variance for this range, and then add a bit of noise, and # that's then our "independent variable" exp_list <- c(.25, 1, 3, .25) for (i in 1:M){ #exponent <- runif(1, .25, 1) exponent <- exp_list[i] pexp <- df[,(M+1)]^exponent evar <- (max(pexp)-min(pexp))/ev print(paste0('exponent ',i,' is ',exponent)) df[,i] <- trunc(pexp + rnorm(NC, 0, sd = evar)) df[,i + M + 1] <- df[,i]/df[,(M+1)] } df <- scale(df) pairs(df[,1:(M+1)]) # take a look print('Multiple variable naive') 
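# Column 1 of df is the first indicator, columns 2:M the remaining indicators,
# column M+1 the city population, and columns M+2 onward the per-capita ratios;
# the regressions below compare the naive specification, the all-ratio specification,
# and the specification that controls for population directly.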
summary(lm(df[,1]~df[,2:M])) # naive ecological regression print('Multiple variable ratio DV TOO') summary(lm(df[,(M+2)]~df[,(M+3):(2*M+1)])) print('Multiple variable controlling for population') summary(lm(df[,1]~df[,2:M]+df[,(M+1)])) # controlling for population df_save_fixed <- df write.table(df_save_fixed, 'df_save_fixed.txt') local_reg <- function(df, M, window, do_plot = T){ if (window %% 2 != 1) window <- window + 1 # make it odd len_df <- nrow(df) start <- (window - 1)/2 stop <- len_df - start outlen <- stop - start + 1 out_b <- matrix(data = NA, nrow = outlen, ncol = M) df2 <- df[order(df[,(M+1)]),] for (i in start:stop){ df3 <- df2[(i-start + 1):(i + start),] bin_m <- lm(df3[,1]~df3[,2]) out_b[i-start+1,1] <- bin_m$coefficients[2] mult_m <- lm(df3[,1]~df3[,2:M]) out_b[i-start+1,2:M] <- mult_m$coefficients[2:M] } if(do_plot){ l_out_b <- nrow(out_b) plot(x = start:stop, y = out_b[,2], type = 'l', xlab = 'population', ylab = 'slope', main = 'multivariate') lines(x = start:stop, y = out_b[,3], type = 'l',lty = 2) lines(x = start:stop, y = out_b[,4], type = 'l',lty = 3) } return(out_b) } out_b <- local_reg(df_save_fixed, M = 4, window = 9) ptr <- 0 len1 <- length(seq(nrow(df_save_fixed)-5,7,-1)) out_smooth <- matrix(data=NA, nrow =len1, ncol = (M+1)) for (w in seq(nrow(df_save_fixed)-5,7,-1)){ ptr <- ptr + 1 print(paste0('working on ',w)) out_bw <- local_reg(df_save_fixed, M = 4, window = w, do_plot = F) out_smooth[ptr,1] <- w out_smooth[ptr, 2:(M+1)] <- apply(out_bw, 2, mean) if (ptr %% 20 == 0) write.table(out_smooth, 'out_smooth.dat') } plot(x = out_smooth[,1], y = out_smooth[,5], xlim = c(495,7), type = 'l', xlab = 'window', ylab = 'slope', main = 'multivariate') lines(x = out_smooth[,1], y = out_smooth[,4], xlim = c(495,7), type = 'l', lty = 2) lines(x = out_smooth[,1], y = out_smooth[,3], xlim = c(495,7), type = 'l', lty = 3) [6.4] R-code: # first figure GSW <- as.data.frame(read.table('gsw.txt', header = T)) cc <- cor(GSW$W_BLK, GSW$TOT90) plot (x = GSW$TOT90, y = GSW$W_BLK, xlab = 'district population', ylab = 'White on black hate crime', cex.lab = 1.5, pch = 16, cex =2) legend(x = 'topleft', legend = paste0('Correlation: ', format(cc, digits = 2))) GSW$NW <- GSW$TOT90*GSW$PW90 # recreate number of non-hispanic whites GSW$NB <- GSW$TOT90*GSW$PB90 # recreate number of non-hispanic whites # next figure -- crimes by n_whites cc <- cor(GSW$W_BLK, GSW$NW) plot (x = GSW$NW, y = GSW$W_BLK, xlab = 'number of whites', ylab = 'White on black hate crime', cex.lab = 1.5, pch = 16, cex =2) legend(x = 'topleft', legend = paste0('Correlation: ', format(cc, digits = 2))) # this next one isn't shown in paper but here if you want to # see the equivalent cc <- cor(GSW$W_BLK, GSW$NB) plot (x = GSW$NB, y = GSW$W_BLK, xlab = 'number of blacks', ylab = 'White on black hate crime', cex.lab = 1.5, pch = 16, cex =2) legend(x = 'topleft', legend = paste0('Correlation: ', format(cc, digits = 2))) # now by number of black-white dyads GSW$BWINT <- GSW$NW*GSW$NB/1000000 cc <- cor(GSW$W_BLK, GSW$BWINT) plot (x = GSW$BWINT, y = GSW$W_BLK, xlab = 'number of black-white dyads (millions)', ylab = 'White on black hate crime', cex.lab = 1.5, pch = 16, cex =2) legend(x = 'topleft', legend = paste0('Correlation: ', format(cc, digits = 2))) [6.5] R code: N <- 100 # number of cases lD <- runif(N, 1, 10) Dn <- exp(lD) r <- runif(N,.1,.9) Nm <- r*Dn lN <- log(Nm) line1 <- lm(lN ~ lD) r2 <- summary(line1)$r.squared plot(x = lD, y = lN, xlab='log of denominator', ylab = 'log of numerator', pch = 16) abline(line1, lty = 2, 
lwd = 2)
legend(x = 'topleft', legend = paste0('R-sq: ', format(r2, digits=2)))
[6.6] R-code:
str(GSW) # take a look!
library(MASS)
# First, we add unknown and white perps together as in GSW.
# If you don't do this, the interaction
# isn't going to be significant.
GSW$HC <- GSW$W_BLK + GSW$UK_BLK
m1 <- glm.nb(HC ~ PW80 + CHNGBLK + XBLK, data = GSW)
summary(m1)
# let's for a second reverse the direction,
# and look at black on white hate crime....
m1a <- glm.nb(B_WHT ~ PW80 + CHNGBLK + XBLK, data = GSW)
# note that the coefficient is similar for black on white hate crime!
# that doesn't make much sense given their theory!
summary(m1a)
# anyway, back to GSW. here is their model 2
m2 <- glm.nb(HC ~ PW80 + CHNGBLK + XBLK + LNTOT90, data = GSW)
summary(m2)
# I don't know why I had the district labeled as #47;
# it is in the original data #2,
# COMMUN value is 103.6
m2a <- glm.nb(HC ~ PW80 + CHNGBLK + XBLK + LNTOT90, data = GSW[-2,])
summary(m2a) # now the p-value is marginally significant
m1b <- glm.nb(HC ~ PW80 + CHNGBLK + XBLK, data = GSW[-2,])
summary(m1b) # even without the control for lntot90.
# and here is the model corresponding to 6.9
GSW$PB80sq <- GSW$PB80*GSW$PB80
m69 <- glm.nb(W_BLK ~ PB80 + PB90 + PB80*PB90 + PB80sq, data = GSW[-2,])
summary(m69)
# note that the fact that PB90 is almost the same as -1 x PB80:PB90 isn't necessarily
# true; flipping and predicting B_WHT will show that
m69b <- glm.nb(B_WHT ~ PB80 + PB90 + PB80*PB90 + PB80sq, data = GSW[-2,])
summary(m69b)
[6.7] R-code:
Nschool <- 50 # number of schools
NG <- 400 # number girls in a school
#NB <- 400 # number boys in a school
PB <- rep(runif(Nschool, .1, .9), each = NG) # make percent black at school level, but one for each individual record
Id <- 1:(NG*Nschool) # make records for each girl
School <- rep(1:Nschool, each = NG) # school number
Race_girl <- as.numeric(runif(NG, 0,1) < PB) # assign race of each girl
Race_bf <- as.numeric(runif(NG, 0,1) < PB) # assign race of boyfriend to every girl
Cross <- 1 - as.numeric(Race_girl == Race_bf)
Date_a <- data.frame(Id = Id, School = School, Race_girl, Race_bf, Cross = Cross)
adf1 <- aggregate(Date_a, by = list(Date_a$School), FUN = mean)
adf1$BWO_bf <- adf1$Race_bf/(1-adf1$Race_bf)
adf1$CO_bf <- adf1$Cross/(1-adf1$Cross)
adf1$PB_bf <- adf1$Race_bf
adf1$LO_bf <- log(adf1$BWO_bf)
adf1$LCO_bf <- log(adf1$CO_bf)
Date_a$BWO_bf <- rep(adf1$BWO_bf, each = NG)
Date_a$PB_bf <- rep(adf1$PB_bf, each = NG)
Date_a$LO_bf <- rep(adf1$LO_bf, each = NG)
Date_a$LCO_bf <- rep(adf1$LCO_bf, each = NG)
Date_a$PDE <- Date_a$PB_bf*(1-Date_a$Race_girl) + (1-Date_a$PB_bf)*Date_a$Race_girl # proportion different ethnicity
summary(glm(Cross ~ Race_girl + PDE, data = Date_a, family = binomial))
summary(glm(Race_bf ~ Race_girl + PB_bf, data = Date_a, family = binomial))
summary(glm(Race_bf ~ Race_girl + LO_bf, data = Date_a, family = binomial))
summary(glm(Cross ~ Race_girl + LCO_bf, data = Date_a, family = binomial))
summary(glm(Cross ~ PDE, data = Date_a, family = binomial))
summary(glm(Race_bf ~ PB_bf, data = Date_a, family = binomial))
#-------------------
#loop
prc <- rep(0, 500)
loc <- rep(0, 500)
for (i in 1:500){
  Nschool <- 50 # number of schools
  NG <- 400 # number girls in a school
  PB <- rep(runif(Nschool, .1, .9), each = NG)
  School <- rep(1:Nschool, each = NG) # school number
  Race_girl <- as.numeric(runif(NG, 0,1) < PB)
  Race_bf <- as.numeric(runif(NG, 0,1) < PB)
  Cross <- 1 - as.numeric(Race_girl == Race_bf)
  Date_a <- data.frame(Id = Id, School = School, Race_girl, Race_bf, Cross = Cross)
  adf1 <-
aggregate(Date_a, by = list(Date_a$School), FUN = mean)
  adf1$BWO_bf <- adf1$Race_bf/(1-adf1$Race_bf)
  adf1$CO_bf <- adf1$Cross/(1-adf1$Cross)
  adf1$PB_bf <- adf1$Race_bf
  adf1$LO_bf <- log(adf1$BWO_bf)
  adf1$LCO_bf <- log(adf1$CO_bf)
  Date_a$BWO_bf <- rep(adf1$BWO_bf, each = NG)
  Date_a$PB_bf <- rep(adf1$PB_bf, each = NG)
  Date_a$LO_bf <- rep(adf1$LO_bf, each = NG)
  Date_a$LCO_bf <- rep(adf1$LCO_bf, each = NG)
  Date_a$PDE <- Date_a$PB_bf*(1-Date_a$Race_girl) + (1-Date_a$PB_bf)*Date_a$Race_girl
  prc[i] <- coef(glm(Race_bf ~ Race_girl + PB_bf, data = Date_a, family = binomial))[1]
  loc[i] <- coef(glm(Race_bf ~ Race_girl + LO_bf, data = Date_a, family = binomial))[1]
}
hist(prc, main = 'intercept, percentage method', ylab = 'frequency', xlab = 'value')
hist(loc, main = 'intercept, log-odds method', ylab = 'frequency', xlab = 'value')
[7.1] R code:
time <- -50:50
nobs <- length(time)
ee <- rnorm(nobs,0,.3)
yy <- exp(-(time/25)^2) + ee
ee <- rnorm(nobs,0,.3)
xx <- exp(-(time/25)^2) + ee
plot(x = time, y = yy)
plot(x = time, y = xx)
summary(lm(yy ~ xx))
lagnum <- 1 # choose how many times to lag xx by
lagx <- rep(0, lagnum)
lx <- c(lagx,xx[1:(nobs-lagnum)])
summary(lm(yy[lagnum+1:nobs] ~ lx[lagnum+1:nobs]))
# instead, run time backwards by lagging y
lagnum <- 1 # choose how many times to lag yy by
lagy <- rep(0, lagnum)
ly <- c(lagy,yy[1:(nobs-lagnum)])
summary(lm(ly[lagnum+1:nobs] ~ xx[lagnum+1:nobs]))
# do lagged y predicting x, but we get it backwards
ee <- rnorm(nobs,0,.3)
x2 <- ly + ee
summary(lm(yy ~ x2))
lagnum <- 1 # choose how many times to lag x2 by
lagx2 <- rep(0, lagnum)
lx2 <- c(lagx2,x2[1:(nobs-lagnum)])
summary(lm(yy[lagnum+1:nobs] ~ lx2[lagnum+1:nobs]))
[7.2] R code:
# note first you must run the code in 7.1
# now show effects of increasing lag
ntodo <- 20
bs <- rep(0, 2*ntodo-1)
bptr <- 1
for (i in (-(ntodo-1)):(ntodo-1)){
  lagnum <- i
  if(i>0){
    lagx <- rep(0, lagnum)
    lx <- c(lagx,xx[1:(nobs-lagnum)])
    bs[bptr] <- summary(lm(yy[lagnum+1:nobs] ~ lx[lagnum+1:nobs]))$coefficients[2]
  }else if(i<0){
    lagx <- rep(0, -lagnum)
    lx <- c(xx[(-lagnum+1):nobs], lagx)
    bs[bptr] <- summary(lm(yy[1:(nobs+lagnum)] ~ lx[1:(nobs+lagnum)]))$coefficients[2]
  }else{
    lx <- xx
    bs[bptr] <- summary(lm(yy[lagnum+1:nobs] ~ lx[lagnum+1:nobs]))$coefficients[2]
  }
  bptr <- bptr + 1
}
plot(x = -(ntodo-1):(ntodo-1), y = bs, xlab = 'amount of lag', ylab = 'coefficient', type = 'b')
[7.3] R code:
fd <- function(x){
  fd <- vector(length = length(x)-1)
  for (i in 2:length(x)){
    fd[i - 1] <- x[i] - x[i-1]
  }
  return(fd)
}
lag <- function(x){
  lag <- vector(length = length(x)-1)
  lag[1] <- NA
  for (i in 2:length(x)){
    lag[i] <- x[i-1]
  }
  return(lag)
}
t <- 1:50
y <- 5 + (t^2)/100
x <- 30 - (t^2)/100
summary(lm(y ~ x))
summary(lm(fd(y) ~ lag(fd(x)), na.action = na.omit))
[7.4] R code:
install.packages('MASS')
library(MASS)
# This is now with N different groups over T different times.
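# make_linTV() below builds a balanced N-by-T panel in which every case's x and y
# share the same linear time trend; make_TV() instead builds each series from first
# differences (optionally cyclical), and TV_output() compares naive, time-controlled,
# first-difference, and fixed-effect estimates on the vectorized panel.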
# b is slope connecting time to the change # evar is the error variance on change; # frequency is for a cycle that goes up and down; if 0 is linear # xycor allows us to start with a correlation between x and y # this does the first simple linear model make_linTV <- function(N, T, b, evar, xycor = 0){ Sigma <- matrix(c(1,xycor,xycor,1),2,2) bb <- mvrnorm(n = N, rep(0, 2), Sigma, empirical = TRUE) Y <- matrix(data = NA, nrow = N, ncol = T) X <- matrix(data = NA, nrow = N, ncol = T) YD <- matrix(data = NA, nrow = N, ncol = T) XD <- matrix(data = NA, nrow = N, ncol = T) Y[,1] <- bb[,1] X[,1] <- bb[,2] YD[,1] <- NA XD[,1] <- NA for (i in 2:T){ ii <- i e3 <- rnorm(N,0, evar) e4 <- rnorm(N,0, evar) Y[,ii] <- Y[,1] + (ii-1)*b + e3 X[,ii] <- X[,1] + (ii-1)*b + e4 XD[,ii] <- X[,ii] - X[,(ii-1)] YD[,ii] <- Y[,ii] - Y[,(ii-1)] } YV <- as.vector(Y) XV <- as.vector(X) YDV <- as.vector(YD) XDV <- as.vector(XD) case <- rep(1:N, T) # this tells us which case is in which # position in vectorized form time <- rep(1:T, each = N) # now we make a matrix of fixed effects for vectorized form. FElen <- N*T FE1 <- matrix(data = 0, nrow = FElen, ncol = N) FE2 <- matrix(data = 0, nrow = FElen, ncol = T) for (i in 1:FElen){ FE1[i,case[i]] <- 1 FE2[i,time[i]] <- 1 } TV <- data.frame(ID = case, y = YV, x = XV, yd = YDV, xd = XDV, t = time, fec = FE1, fet = FE2) return(TV) } # this does the others.... # the way in which this works is we start with time 1 data, then compute a # difference, then add that on to the past time to make the next one make_TV <- function(N, T, b, evar, xycor = 0, frequency = 0){ if (frequency >0) {p <- 2*pi*(frequency/T)} Sigma <- matrix(c(1,xycor,xycor,1),2,2) bb <- mvrnorm(n = N, rep(0, 2), Sigma, empirical = TRUE) Y <- matrix(data = NA, nrow = N, ncol = T) X <- matrix(data = NA, nrow = N, ncol = T) YD <- matrix(data = NA, nrow = N, ncol = T) XD <- matrix(data = NA, nrow = N, ncol = T) Y[,1] <- bb[,1] X[,1] <- bb[,2] YD[,1] <- NA XD[,1] <- NA for (i in 2:T){ ii <- i #X[,i] <- X[,i-1] + e3 e3 <- rnorm(N,0, evar) e4 <- rnorm(N,0, evar) if (frequency >0) { XD[,i] <- e3 + b*sin(p*ii) YD[,i] <- e4 + b*sin(p*ii) }else{ XD[,i] <- e3 + b*ii YD[,i] <- e4 + b*ii } X[,i] <- X[,i-1] + XD[,i] Y[,i] <- Y[,i-1] + YD[,i] } YV <- as.vector(Y) XV <- as.vector(X) YDV <- as.vector(YD) XDV <- as.vector(XD) case <- rep(1:N, T) # this tells us which case is in which position # in vectorized form time <- rep(1:T, each = N) # now we make a matrix of fixed effects for vectorized form. 
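  # FE1 and FE2 are dummy (indicator) matrices with one column per case and per time
  # point; TV_output() later pastes their column names (fec.*, fet.*) into the formulas
  # for the fixed-effect models.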
FElen <- N*T
  FE1 <- matrix(data = 0, nrow = FElen, ncol = N)
  FE2 <- matrix(data = 0, nrow = FElen, ncol = T)
  for (i in 1:FElen){
    FE1[i,case[i]] <- 1
    FE2[i,time[i]] <- 1
  }
  TV <- data.frame(ID = case, y = YV, x = XV, yd = YDV, xd = XDV, t = time, fec = FE1, fet = FE2)
  return(TV)
}
TV_output <- function(TV, N, T){
  pairs(~ TV$x + TV$y + TV$t + TV$xd + TV$yd)
  f1names <- paste0('fec.', 1:(N-1))
  f2names <- paste0('fet.', 1:(T-1))
  print('Here is a naive model')
  print(summary(lm(y ~ x, data = TV, na.action = na.omit)))
  print('Here is a model with time as a control')
  print(summary(lm(y ~ x + t, data = TV, na.action = na.omit)))
  print('Here is a first difference model')
  print(summary(lm(yd ~ xd, data = TV, na.action = na.omit)))
  print('Here is a case fixed effect model')
  Formula1 <- formula(paste("y ~ x +", paste(f1names, collapse=" + ")))
  print(summary(lm(Formula1, data = TV, na.action = na.omit)))
  print('Here is a time fixed effect model')
  Formula1 <- formula(paste("y ~ x +", paste(f2names, collapse=" + ")))
  print(summary(lm(Formula1, data = TV, na.action = na.omit)))
  print('Here is a time and case fixed effect model')
  Formula1 <- formula(paste("y ~ x +", paste(f1names, collapse=" + "), "+", paste(f2names, collapse=" + ")))
  summary(lm(Formula1, data = TV, na.action = na.omit))
}
N <- 10
T <- 10
b <- 1
evar <- .4
# row 1
evar <- 2
xycor <- 0
TV <- make_linTV(N, T, b, evar, xycor = 0)
TVLinTime <- TV
TV_output(TV, N, T)
# row 2
frequency <- 0
xycor = 0
evar <- 5
TV <- make_TV(N, T, b, evar, xycor, frequency)
TV_output(TV, N, T)
# row 3
xycor <- 0
frequency <- 2
evar <- .1
TV <- make_TV(N, T, b, evar, xycor, frequency)
TV_output(TV, N, T)
# row 4
xycor <- .4
TV <- make_TV(N, T, b, evar, xycor, frequency)
TV_output(TV, N, T)
[7.5] R-code:
# First
wave_try <- function(span, N, mu, size){
  y <- rnbinom(N, mu = mu, size = size)
  x = 1:N
  raw_df <- data.frame(x = x, y = y)
  mod <- loess(y ~ x, data = raw_df, span = span)
  y_s <- predict(mod, newdata = data.frame(x = x))
  smooth_df <- data.frame(x = x, y = y_s)
  f_lines <- plot(smooth_df, type = 'l', col = 'red', lwd = 2)
  par(new=T)
  f_dots <- plot(raw_df$y ~ raw_df$x, xlab = '', ylab = '', yaxt='n', col = 'grey75', pch = 16, cex = .75)
  par(new=F)
}
wave_try(span = .07, N = 200, mu = .7, size = 100)
[7.6] Imagine that we have ten different processes: for example, they might generate different types of events. They all have some basic similarity, and so we can imagine that we'll approach these data with the presupposition that they "sum to 1" in some sense. For example, they could be events that get reported on, and each type takes up a certain proportion of the newspaper. Or they could be bills that are proposed by different sources, and take up a certain amount of floor time. Or, more subtly, they are changes in the allocation of some general attention space, such as interest in one performer or genre as opposed to another, or one political cause as opposed to another. In such a case, we might not get information on all ten processes. We might not even realize that there is a trade-off in the attention space. Now take a look at Figure **: here we are looking at the smoothed popularity of three of these different trends. The solid line and the light dotted line tend to go up and down together, while both run opposite to the heavier dashed line. From this, a theory is born! But really, in the raw data there is no periodicity at all; all of the series are random.
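As a minimal illustration of the mechanism (this sketch is mine, not the simulation reported in the text, and the object names are arbitrary), smoothing two independent noise series with a narrow loess span will typically make them look more strongly related than the raw series are:
set.seed(1)
n_demo <- 200
t_demo <- 1:n_demo
a_demo <- rnorm(n_demo) # two independent white-noise series
b_demo <- rnorm(n_demo)
a_sm <- predict(loess(a_demo ~ t_demo, span = .07)) # smooth each one separately
b_sm <- predict(loess(b_demo ~ t_demo, span = .07))
cor(a_demo, b_demo) # correlation of the raw series
cor(a_sm, b_sm) # correlation of the smoothed series, typically larger in absolute value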
Further, we have only done the smoothing after the trade-off in the attention space has already been built in. We probably don't have any sense that we could be inducing a false correlation. And since these three processes only add up to (on average) 30% of the total, we aren't going to suspect that we are forcing some sort of negative correlation among them. Yet we see this repeated periodicity of some series going together and others going against one another. Do you think maybe there really is periodicity, just by random chance? Figure 6 portrays a spectral analysis of the three curves. If there were periodicity, there would be a clear peak around its frequency. Instead, this is pink noise. Finally, smoothing tends to increase the rate of false positives. In a set of simulations (code provided below), I found that when regressing any pair of these variables, the p-value I got using the smoothed data was more significant than the one reached with the raw data around 60% of the time, and there were around one third more false positive findings using the smoothed data. In sum, we want to smooth data, but we need to do the same thing we did with lags: try out a large range of windows, from very small to very large, and see how our relationships vary across these windows. If we suddenly see relationships emerging or changing with different windows, that suggests that we aren't finding patterns in the data: we're creating them.
R-code:
# Second
wave_try2 <- function(span, N, mu, size, K, plots = T){
  Y <- replicate(K, rnbinom(N, mu = mu, size = size)) # makes N by K matrix...
  Yc <- rowSums(Y)
  Z <- Y / Yc
  x = 1:N
  raw_df <- data.frame(x = x, y1 = Z[,1], y2 = Z[,2], y3 = Z[,3])
  mod1 <- loess(y1 ~ x, data = raw_df, span = span)
  mod2 <- loess(y2 ~ x, data = raw_df, span = span)
  mod3 <- loess(y3 ~ x, data = raw_df, span = span)
  y_s1 <- predict(mod1, newdata = data.frame(x = x))
  y_s2 <- predict(mod2, newdata = data.frame(x = x))
  y_s3 <- predict(mod3, newdata = data.frame(x = x))
  #smooth_df <- data.frame(x = x, y = y_s)
  if(plots){
    f_lines <- plot(x = x, y = y_s1, ylab='Smoothed', type = 'l', col = 'grey75', lty =3, lwd = 4)
    par(new=T)
    f_lines <- lines(x = x, y = y_s2, yaxt='n',type = 'l', col = 'black', lwd = 2)
    f_lines <- lines(x = x, y = y_s3, yaxt='n',type = 'l', col = 'grey25', lty =2, lwd = 4)
    par(new=F)
    l1 <- spectrum(y_s1, type = 'l', main = 'Periodogram', col = 'grey75', lty =3, lwd = 4)
    par(new=T)
    l2 <- spectrum(y_s2, yaxt='n', main = '', type = 'l', col = 'black', lwd = 2)
    par(new=T)
    l3 <- spectrum(y_s3, yaxt='n', main = '', type = 'l', col = 'grey25', lty =2, lwd = 4)
    par(new=F)
  }
  outdf <- data.frame(x = x, ys1 = y_s1, ys2 = y_s2, ys3 = y_s3, y1 = raw_df$y1, y2 = raw_df$y2, y3 = raw_df$y3)
}
df1 <- wave_try2(span = .07, N = 200, mu = .7, size = 100, K = 10)
# this next part tries to see whether smoothing increases the rate of false
# positives....it isn't printed in the text but feel free to experiment with
N <- 200
to_do <- seq(N/10,N,10) # take slices of different sizes
Nsim <- 100
ltd <- length(to_do)
raw_p <- rep(0, ltd) # will hold the sum of significant p-values from raw data
smooth_p <- rep(0, ltd) # will hold the sum of significant p-values from smoothed data
total_less <- 0 # this will hold number of times that the smoothed p value is
# less than the raw
total_den <- 3*ltd*Nsim
r <- rep(0,3) # will hold p-values from raw data
s <- rep(0,3) # will hold p-values from smoothed data
pc <- c(.05, .05, .05) # critical values
ii <- 0
for (i in to_do){
  ii <- ii + 1
  for (j in 1:Nsim){
    df1 <-
wave_try2(span = .07, N = N, mu = .7, size = 100, K = 10, plots = F) r[1] <- summary(lm(df1$y1 ~ df1$y2))$coefficients[2,4] r[2] <- summary(lm(df1$y1 ~ df1$y3))$coefficients[2,4] r[3] <- summary(lm(df1$y2 ~ df1$y3))$coefficients[2,4] s[1] <- summary(lm(df1$ys1 ~ df1$ys2))$coefficients[2,4] s[2] <- summary(lm(df1$ys1 ~ df1$ys3))$coefficients[2,4] s[3] <- summary(lm(df1$ys2 ~ df1$ys3))$coefficients[2,4] raw_p[ii] <- raw_p[ii] + sum(as.integer((r < pc))) smooth_p[ii] <- smooth_p[ii] + sum(as.integer((s < pc))) total_less <- total_less + sum(as.integer(s < r)) } } raw_p <- raw_p/(3*Nsim) smooth_p <- smooth_p/(3*Nsim) pless <- total_less/total_den print(paste('In',round(pless*100),'percent of the comparisons, the smoothed was more significant than the raw')) ylmn <- min(smooth_p, raw_p) ylmx <- max(smooth_p, raw_p) ylims <- c(ylmn,ylmx) plot(x = to_do, y = smooth_p, type = 'l', xlab = 'number cases taken', ylab = 'proportion significant', ylim = ylims) lines(x=to_do, y = raw_p, type = 'l', lty = 2, ylim = ylims) legend('topright', legend = c('smoothed', 'raw'), lty = c(1,2)) [7.7] R-code: library(sp) library(spdep) library(geoR) # This supplies the grf maxx <- 50 # number of cells on side of grid; #now make spatially correlated data with grf() rd <- grf(maxx^2, grid = "reg", cov.model = 'gaussian', cov.pars = c(5, .05), nsim = 1) sdf1 <- data.frame(z1 = rd$data, coord.x = rep(1:maxx, times = maxx), coord.y = rep(1:maxx, each = maxx)) rd <- grf(maxx^2, grid = "reg", cov.model = 'gaussian', cov.pars = c(5, .05), nsim = 1) sdf1$z2 = rd$data rd <- grf(maxx^2, grid = "reg", cov.model = 'gaussian', cov.pars = c(5, .05), nsim = 1) sdf1$z3 = rd$data sdf1$e1 <- rnorm(maxx*maxx,0,1) # make a bit of error sdf1$x1 <- sdf1$z1 + sdf1$z3 # we add the spatially correlated pre-variables sdf1$x2 <- sdf1$z2 + sdf1$z3 # to make correlated x1 and x2 sdf1$y = sdf1$x1 + sdf1$x2 + sdf1$e1 # this is the true model. # Slopes for both are 1.0 summary(lm(sdf1$y ~ sdf1$x1)) # model 3 summary(lm(sdf1$y ~ sdf1$x1 + sdf1$x2)) # if you want to check that empirical # B's are darn close to 1.0. # make weights matrix, simple neighbor contiguity. Wnb <- cell2nb(nrow = maxx, ncol = maxx, type = 'queen', torus = FALSE) Wlw <- nb2listw(Wnb, style = 'W') # use spdep to make neighbors lag1 <- lagsarlm(formula = sdf1$y ~ sdf1$x1, data = sdf1, listw = Wlw) summary(lag1) # model 1 lag2 <- lagsarlm(formula = sdf1$y ~ sdf1$x1 + sdf1$x2, data = sdf1, listw = Wlw) summary(lag2) # model 2 error1 <- errorsarlm(formula = sdf1$y ~ sdf1$x1, data = sdf1, listw = Wlw) summary(error1) # model 4 error2 <- errorsarlm(formula = sdf1$y ~ sdf1$x1 + sdf1$x2, data = sdf1, listw = Wlw) summary(error2) # model 5 sdm1 <- lagsarlm(formula = sdf1$y ~ sdf1$x1, data = sdf1, listw = Wlw, type = 'mixed') summary(sdm1) # model 6 sdm2 <- lagsarlm(formula = sdf1$y ~ sdf1$x1 + sdf1$x2, data = sdf1, listw = Wlw, type = 'mixed') summary(sdm2) # confirm that SDM with both predictors also nails it. 
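# impacts() below decomposes each predictor's effect in the spatial-lag and Durbin
# models into direct, indirect (spillover), and total impacts; with a spatially lagged
# dependent variable the raw coefficient by itself omits the feedback through neighbors.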
impacts(lag1, listw = Wlw) impacts(lag2, listw = Wlw) impacts(sdm1, listw = Wlw) # make the contour plot; start by making axis function mylevels2 <- seq(8500,0,-500) sdata <- sdf1$x1 dim(sdata) <- c(maxx, maxx) filled.contour(sdata, col=grey(seq(.9,0.2,length=length(mylevels2)))) [7.8] R-code: library(lattice) # this takes two sets of coordinates, each in the form of two rows, # horizontal and vertical p # positions, and computes the distances between them, # returning an NA by NB matrix dist_two_sets <- function(A1, B1){ Na <- ncol(A1) Nb <- ncol(B1) Dist1 <- matrix(data = 0, nrow = Na, ncol = Nb) for(i in 1:Na){ # you can take the boy out of FORTRAN.... for(j in 1:Nb){ # but you can't take the FORTRAN out of the boy.... Dist1[i,j] <- sqrt((A1[1,i]-B1[1,j])^2 + (A1[2,i]-B1[2,j])^2) } } return(Dist1) } # this puts anything in the boxes; # assumes that it gets something with first two # columns HCat and VCat, and others waiting to be populated. # If data_vec isn't null # it fills this in by multiplying what is in fine by data_vec sort_em <- function(fine, coarse, coarse_col, data_vec = NULL, fine_max_h = 0, fine_max_v = 0){ if (fine_max_h == 0) fine_max_h <- trunc(max(fine[1,])+1) if (fine_max_v == 0) fine_max_v <- trunc(max(fine[2,])+1) coarse_max_h <- max(coarse[,1]) coarse_max_v <- max(coarse[,2]) hdiv <- fine_max_h/coarse_max_h vdiv <- fine_max_v/coarse_max_v for (i in 1:ncol(fine)){ HC1 <- trunc(fine[1,i]/hdiv)+1 VC1 <- trunc(fine[2,i]/vdiv)+1 if(is.null(data_vec)){ add_me <- 1 }else{ add_me <- data_vec[i] } coarse[which((coarse[,1]==HC1)&(coarse[,2]==VC1)), coarse_col] <- coarse[which((coarse[,1]==HC1)&(coarse[,2]==VC1)), coarse_col] + add_me } return(coarse) } H1 <- 50 # size of horizontal dimension V1 <- 50 # size of vertical dimension R1h <- 20 # number of rasters in horizontal dimension R1v <- 20 # number of rasters in vertical dimension Np <- 10000 # number of persons Na <- 200 # number of amenities a Ca <- 20 # catchment size of a Nb <- 100 # number of amenities b Cb <- 20 # catchment size of b Ha <- runif(Na, 0, H1) # placement of a's on the horizontal dimension Va <- runif(Na, 0, V1) # placement of a's on the vertical dimension Hb <- runif(Nb, 0, H1) # placement of b's on the horizontal dimension Vb <- runif(Nb, 0, V1) # placement of b's on the vertical dimension Hp <- runif(Np, 0, H1) # placement of p's on the horizontal dimension Vp <- runif(Np, 0, V1) # placement of p's on the vertical dimension A1 <- rbind(Ha,Va) # this is the placements of amenities A dim(A1) <- c(2, Na) B1 <- rbind(Hb,Vb) # this is the placements of amenities B dim(B1) <- c(2, Nb) P1 <- rbind(Hp,Vp) # this is the placements of the persons dim(P1) <- c(2, Np) DA <- dist_two_sets(P1, A1) #rows are persons DB <- dist_two_sets(P1, B1) # columns amenities DA_close <- matrix(data = 0, nrow= Np, ncol = Na) DA_close[which(DANn) Tp <- .85 # probability of transmission of P Tn <- .85 # probability of transmission of N R3 <- runif(1000, 0, 1) # for transmission T2 <- T1*(R3Tn) R2 <- runif(1000, 0, 1) # for observations O2 <- T2*(R2Nn) R3 <- runif(1000, 0, 1) # for transmission T3 <- T2*(R3Tn) R2 <- runif(1000, 0, 1) # for observations O3 <- T3*(R2Nn) df2 <- as.data.frame(cbind(O1, O2, O3, T1, T2, T3)) model <- 'O3 ~ O2 + O1' summary(glm(model, family = "binomial")) [8.2] R code: # this first function just sets any variable to have a new mean and sd numnsd <- function(x, numn, nusd){ mn <- mean(x) sd <- sd(x) x2 <- (nusd/sd)*(x - mn) + numn return(x2) } Time1 <- runif(1000,0,1) mn <- mean(Time1) sd <- sd(Time1) e2 
<- rnorm(1000,0,.3)
x2 <- (1*(Time1)^4 + e2 + .5)/2
Time2 <- numnsd(x2, mn, sd)
e3 <- rnorm(1000,.1,.1)
x3 <- (1*(x2+.04)^4 + e3 + .5)/2
Time3 <- numnsd(x3, mn, sd)
pairs(~ Time1 + Time2 + Time3)
summary(lm(Time3 ~ Time2 + Time1))
[8.3] R-code:
N <- 200
Trials <- 200
# this random error version
forwards <- rep(0, Trials)
backwards <- rep(0, Trials)
val1 <- rep(0, 2*Trials)
for (i in 1:Trials){
  pr <- runif(N, 0, 1)
  trait1 <- rbinom(N, 1, pr)
  trait2 <- rbinom(N, 1, pr)
  r1 <- glm(trait1 ~ pr, family = binomial)
  r1s <- r1$residuals
  r2 <- glm(trait2 ~ r1s + trait1, family = binomial)
  forwards[i] <- r2$coefficients[2]
  r1b <- glm(trait2 ~ pr, family = binomial)
  r2s <- r1b$residuals
  r3 <- glm(trait1 ~ r2s + trait2, family = binomial)
  backwards[i] <- r3$coefficients[2]
}
par(mfrow=c(1,2))
mf <- mean(forwards)
mb <- mean(backwards)
hist(forwards, col =rgb(0,0,1,1/4), xlim = c(-2, 0), xlab = 'coefficient')
legend('topleft', paste("Mean =", round(mf, 3)))
hist(backwards, col =rgb(0,0,1,1/4), xlim = c(-2,0), xlab = 'coefficient')
legend('topleft', paste("Mean =", round(mb, 3)))
[8.4] R-code:
## Now the actual process
b1 <- .25
forwards <- rep(0, Trials)
backwards <- rep(0, Trials)
val1 <- rep(0, 2*Trials)
for (i in 1:Trials){
  pr <- runif(N, 0, 1)
  ee <- rnorm(N, 0, .5)
  trait1 <- rbinom(N, 1, pr)
  r1 <- glm(trait1 ~ pr, family = binomial)
  r1s <- r1$residuals
  trait2 <- trait1 + as.numeric(pr>.85)*(1-trait1) - as.numeric(pr<.15)*(trait1)
  r2 <- glm(trait2 ~ r1s + trait1, family = binomial)
  forwards[i] <- r2$coefficients[2]
  r1b <- glm(trait2 ~ pr, family = binomial)
  r2s <- r1b$residuals
  r3 <- glm(trait1 ~ r2s + trait2, family = binomial)
  backwards[i] <- r3$coefficients[2]
}
par(mfrow=c(1,2))
mf <- mean(forwards)
mb <- mean(backwards)
hist(forwards, breaks = 30, col =rgb(0,0,1,1/4), xlim = c(-300,0), xlab = 'coefficient')
legend('topleft', paste("Mean =", round(mf, 3)))
hist(backwards, breaks = 30, col =rgb(0,0,1,1/4), xlim = c(0,100), xlab = 'coefficient')
legend('topright', paste("Mean =", round(mb, 3)))
[8.5] R code:
library(igraph)
# this determines, from any place, who we can get to next;
# note that we do NOT allow replacement!
valid_step <- function(X, cur, visited){
  C1 <- which(X[cur,] == 1) # neighbors of the current node
  C2 <- C1[!visited[C1]] # keep only those not yet visited
  if (length(C2) > 0) {
    C3 <- C2[sample(length(C2), 1)] # pick one of them (safe even when only one remains)
  }else{
    C3 <- 0
  }
  return(C3)
}
N <- 500
XG <- sample_pa(N, power = .8, directed = T, algorithm = 'psumtree')
X <- as.matrix(XG)
Nsim <- 100
clen <- 10 # length of chain to pursue
Res <- rep(0, clen) # will hold results: the degree of the node reached at this step
den <- 0 # will be denominator
for (i in 1:Nsim){
  visited <- rep(F, N) # has this node been sampled yet?
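  # each simulated chain starts from a random seed and then repeatedly moves to an
  # unvisited neighbor via valid_step(); Res accumulates the degree of the node reached
  # at each step, so the final plot shows the average degree at each step of the chain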
for (i in 1:Nsim){
  visited <- rep(F, N)               # has this node been sampled yet?
  seed <- trunc(runif(1,1,N)) + 1
  cur <- seed                        # cur is current node
  visited[cur] <- T
  if(degree(XG, v = cur)>0){
    Res[1] <- Res[1] + degree(XG, v = cur)
    for (j in 2:clen){
      if(degree(XG, v = cur) > 1){
        cur <- valid_step(X, cur, visited)
        if(cur != 0){
          visited[cur] <- T          # mark the new node as visited (no replacement)
          Res[j] <- Res[j] + degree(XG, v = cur)
        }else{
          break                      # no legal step left
        }
      }
    }
    if (cur == 0) next               # this means the chain died
  }
  den <- den + 1
}
Res <- Res/den
plot(x = 1:clen, y = Res, type = 'l', lwd = 2, xlab = 'step', ylab = 'average degree')
[8.5] R code:
library(statnet)
library(lattice)
library(latticeExtra)
library(sna)       # We use this version of 'degree'
library(reshape)
# this is a general convenience function
df_to_mat <- function(df, ego = df[,1], alter = df[,2], value = df[,3], symmetric = F, def_val = T, verbose = F){
  N <- max(ego, alter)
  if (verbose) print(paste0('Making matrix of size ', N,' by ',N))
  if(def_val){
    X <- matrix(data = 0, nrow = N, ncol = N)
  }else{
    X <- matrix(data = NA, nrow = N, ncol = N)
  }
  for (i in 1:(nrow(df))){
    X[ego[i],alter[i]] <- value[i]
    if(symmetric) X[alter[i],ego[i]] <- value[i]
  }
  return(X)
}
# this is the preference of some i for any j
compat <- function(i,j, P, S, pref_stat_exp = 1){
  ff <- (S[j]^pref_stat_exp)/abs(P[i]-P[j])
  return(ff)
}
# used by apply below; given a (row) vector, it gives us all the places
# which have a 1 in them
Ofunc <- function(row1){
  Off <- which(row1 == 1)
  return(Off)
}
# this chooses whether or not the person w/personality P
# does extra-curric with value V
extra_cur <- function(P, V){
  r1 <- P + rnorm(1,0,.1)
  choose <- (abs(V - r1) < .45)
  return(choose)
}
# Make social network
# first here are the user settings
N <- 100             # number of people
NC <- 4              # number of classes; that is, there are 4 per day
NO <- 8              # options for each class; you could be in class 1, 2, ..., 8.
NT <- 100            # number of times *****
NE <- 2              # number of edges in initial graph, as multiple of number of nodes
P_cut_off <- .66     # people this far apart can't be friends
nf <- 8              # maximum number of friendship nominations one will make
drop_mult <- .3      # how likely to drop friends; higher is more likely
pref_stat_exp <- 1.3 # exponent for status score in preference formula
asym_add_rand <- F   # when we add last aspirationals, is it random?
additional <- 2      # number of extra aspirational nominations to be made
# now the initialization begins
P <- runif(N, 0, 1)  # Personality type
S <- runif(N, 0, 1)  # status
C <- matrix(data = NA, nrow = N, ncol = NC)   # classes
for (i in 1: NC){
  C[,i] <- trunc(runif(N,1,(NO+1)))           # assign each student to class
}
sports <- as.vector(unlist(lapply(P, extra_cur, V = 0)))
mathletes <- as.vector(unlist(lapply(P, extra_cur, V = 1)))
X <- rgnm(1, N, NE*N, mode = "graph", diag = FALSE)
nt_t <- rep(0, NT)   # this will hold number of new ties made
sum_x <- rep(0, NT)
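# A quick look at compat() using the P, S and pref_stat_exp just created (illustrative
# only; nothing below depends on it): i's preference for j rises with j's status and
# falls with the personality distance between them, and the offer loop below ranks
# classmates by exactly these scores.
compat(1, 2:5, P, S, pref_stat_exp)   # person 1's preference scores for persons 2 through 5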
for (t in 1:NT){
  Old_X <- X
  Of <- matrix(data = 0, nrow = N, ncol = N)   # matrix of offers
  for (ic in 1:NC){
    for (i in 1:N){
      slots <- nf - rowSums(X)[i]              # how many open slots i has
      if (slots > 0){
        classmates <- which(C[,ic] == C[i,ic])
        classmates <- classmates[which(classmates != i)]
        pref_class <- classmates[order(compat(i,classmates,P,S, pref_stat_exp), decreasing = T)]
        Of[i,pref_class[1:slots]] <- 1         # the offers that will be made
      }
    }
  }
  # note now they only make one new tie per day during school hours.
  alloX <- rep(F, N)           # This will hold who has a playdate already
  Asym <- Of*(t(1-Of))         # If we want to look at which ties are asymmetric...
  Of <- Of*t(Of)               # only keep symmetric ties
  Of_1s <- apply(Of, MARGIN =1, FUN = Ofunc)   # list of all chosen by ego
  nt_t[t] <- 0                 # count of new ties made
  for (i in sample(N)){
    ttt <- !alloX[Of_1s[[i]]]                  # who did i pick not yet allocated?
    at_risk <- Of_1s[[i]][which(ttt)]          # these are those "at risk"
    if(length(at_risk) > 0){
      j <- at_risk[sample(length(at_risk), size = 1)]   # we pick one of these and add a tie
      alloX[j] <- T
      X[i,j] <- 1
      X[j,i] <- 1
      nt_t[t] <- nt_t[t] + 1
    }
  }
  # Then they have a playdate, and if they have more than one friend, they can
  # invite two, who then can become friends.
  degX <- sna::degree(X, cmode = 'indegree')   # recall X symmetric
  choosers <- which(degX > 1)
  choosers <- sample(choosers)                 # here we are doing it randomly; we can change
  for (i in choosers){
    pos_invites <- which((X[i,]==1)&(!alloX))
    n_invites <- trunc(runif(1,min(length(pos_invites),2), min(length(pos_invites),4)))
    if(n_invites >0){
      invites <- pos_invites[sample(length(pos_invites), size = n_invites)]
      alloX[invites] <- T
      if (n_invites > 1){
        InvC <- combn(invites, 2)
        for (j in InvC){
          if ((abs(P[InvC[1]]-P[InvC[2]]) < P_cut_off) & (degX[InvC[1]] (nf-1))   # deterministically drop least compatible
  for (i in topped){
    excess <- degX[i] - nf + 1
    friends <- Ofunc(X[i,])
    pref_class <- friends[order(compat(i,friends,P,S, pref_stat_exp), decreasing = F)]
    drop <- pref_class[1:excess]
    for (j in drop){
      X[i,j] <- 0
      X[j,i] <- 0
      nt_t[t] <- nt_t[t] - 1
    }
  }
  sum_x[t] = sum(X)
}
# Now make final aspirational choices
for (i in 1:N){
  chosen <- which(X[i,] == 1)
  nchosen <- length(chosen)
  pdraw <- runif(nchosen,0,nchosen/nf)   # random prob of being dropped
  drop_me <- S[chosen]0)                 # aspirational friendships i never completed
  Naspir <- min(ndropped, sum(Asym[i,]))
  # first way takes random sample, second takes in status order
  if(Naspir>0){
    if (asym_add_rand){
      add_these <- sample(aspir, Naspir)
    }else{
      add_these <- aspir[order(S[aspir], decreasing = T)][1:Naspir]
    }
    X[i,add_these] <- 1
  }
}
Xt <- t(X)
# now we make a data frame for naïve logistic
XX <- X %*% X       # indirect paths
pop <- colSums(X)   # for when we go asymmetric
Xdf <- as.data.frame(melt(X))     # turn result into data base for analysis
Xtdf <- as.data.frame(melt(Xt))
XXdf <- as.data.frame(melt(XX))   # turn result into data base for analysis
Xdf$recip <- Xtdf$value
Xdf$stat1 <- S[Xdf$X1]
Xdf$stat2 <- S[Xdf$X2]
Xdf$pers1 <- P[Xdf$X1]
Xdf$pers2 <- P[Xdf$X2]
Xdf$tri <- XXdf$value
Xdf$sports1 <- sports[Xdf$X1]
Xdf$sports2 <- sports[Xdf$X2]
Xdf$math1 <- mathletes[Xdf$X1]
Xdf$math2 <- mathletes[Xdf$X2]
Xdf$pop1 <- pop[Xdf$X1]
Xdf$pop2 <- pop[Xdf$X2]-Xdf$value   # we don't include this tie
Xdf$persdif <- abs(Xdf$pers1 - Xdf$pers2)
Xdf$both_math <- as.numeric(Xdf$math1&Xdf$math2)
Xdf$both_sports <- as.numeric(Xdf$sports1&Xdf$sports2)
# the first way is yes/no; the second way, which is the one used, counts the number
Xdf$dif_ExC1 <- as.numeric(Xdf$math1&(!Xdf$math2) + (Xdf$sports1&(!Xdf$sports2)) + Xdf$math2&(!Xdf$math1) + (Xdf$sports2&(!Xdf$sports1)))
Xdf$dif_ExC2 <- as.numeric(Xdf$math1&(!Xdf$math2)) + as.numeric(Xdf$sports1&(!Xdf$sports2)) + as.numeric(Xdf$math2&(!Xdf$math1)) + as.numeric(Xdf$sports2&(!Xdf$sports1))
Xdf$both_no_ExC <- as.numeric((!Xdf$math1)&(!Xdf$math2)&(!Xdf$sports1)&(!Xdf$sports2))
summary(glm(value ~ recip + pop2, data = Xdf, family = 'binomial'))
summary(glm(value ~ recip + pop2 + pop1, data = Xdf, family = 'binomial'))
summary(glm(value ~ recip + pop2 + pop1 + tri, data = Xdf, family = 'binomial'))
summary(glm(value ~ pop2 + pop1 + tri, data = Xdf, family = 'binomial'))
summary(glm(value ~ recip + pop2 + pop1 + tri + Xdf$both_math + Xdf$both_sports, data = Xdf, family = 'binomial'))
summary(glm(value ~ recip + pop2 + pop1 + tri + Xdf$both_math + Xdf$both_sports + Xdf$both_no_ExC + Xdf$dif_ExC2, data = Xdf, family = 'binomial'))
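# df_to_mat in miniature before we use it to build the network objects below
# (toy_df is made up purely for illustration): a three-row edge list becomes a
# 3 x 3 matrix, filled in both directions when symmetric = TRUE.
toy_df <- data.frame(ego = c(1, 2, 3), alter = c(2, 3, 1), value = c(1, 1, 1))
df_to_mat(toy_df, toy_df$ego, toy_df$alter, toy_df$value, symmetric = TRUE)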
# now try ergm
Ynet <- network(df_to_mat(Xdf, Xdf$X1, Xdf$X2, Xdf$value, symmetric = F, def_val = T))
Xbmath <- df_to_mat(Xdf, Xdf$X1, Xdf$X2, Xdf$both_math, symmetric = T, def_val = T)
Xbsport <- df_to_mat(Xdf, Xdf$X1, Xdf$X2, Xdf$both_sports, symmetric = T, def_val = T)
Xbnoexc <- df_to_mat(Xdf, Xdf$X1, Xdf$X2, Xdf$both_no_ExC, symmetric = T, def_val = T)
Xdifexc <- df_to_mat(Xdf, Xdf$X1, Xdf$X2, Xdf$dif_ExC2, symmetric = T, def_val = T)
Ynet %e% 'Xbsport' <- Xbsport
Ynet %e% 'Xbmath' <- Xbmath
Ynet %e% 'Xbnoexc' <- Xbnoexc
Ynet %e% 'Xdifexc' <- Xdifexc
# now we’ll try some ERGMs; Note that e1b will take a long time to fit!
# when you do diagnostics, you’ll probably find even for models that
# converge, the chains are “wandering” in an unhealthy manner.
# I’ve let them go on for a long time, and they don’t settle down….
# gwesp is getting at triadic closure
e1 <- ergm(Ynet ~ edges + gwidegree(1) + mutual)
summary(e1)
e1b <- ergm(Ynet ~ edges + gwidegree(decay = 1, fixed = TRUE) + mutual,
            control=control.ergm(MCMC.burnin=50000, MCMLE.maxit=50, MCMC.samplesize=4096, MCMC.interval=5000))
summary(e1b)
e2 <- ergm(Ynet ~ mutual + gwidegree(1) + gwodegree(1) + gwesp + edges)
summary(e2)
e2t <- enformulate.curved(e2)
e2_init <- c(e2t$theta[1:4], e2t$theta[6])   # take out alpha parameter
e2b <- ergm(Ynet ~ mutual + gwidegree(1) + gwodegree(1) + gwesp + edges,
            control = control.ergm(init = e2_init))
summary(e2b)
e3 <- ergm(Ynet ~ gwidegree(1) + gwodegree(1) + gwesp + edges)
summary(e3)
e4 <- ergm(Ynet ~ mutual + gwidegree(1) + gwodegree(1) + gwesp + edgecov(Xbsport) + edgecov(Xbmath) + edges,
           control = control.ergm(MCMLE.maxit=30, MCMC.interval = 1652))
summary(e4)
e4t <- enformulate.curved(e4)
e3t <- enformulate.curved(e3)
e3_init <- c(e3t$theta[1:3], e4t$theta[4])   # take out alpha parameter
e3b <- ergm(Ynet ~ gwidegree(1) + gwodegree(1) + gwesp + edges,
            control = control.ergm(init = e3_init))
e5 <- ergm(Ynet ~ mutual + gwidegree(1) + gwodegree(1) + gwesp + edgecov(Xbsport) + edgecov(Xbmath) + edgecov(Xbnoexc) + edgecov(Xdifexc) + edges,
           control = control.ergm(MCMLE.maxit=20, MCMC.interval = 2000, MCMC.samplesize = 2000))
e5b <- ergm(Ynet ~ mutual + gwidegree(1) + gwodegree(1) + gwesp + edgecov(Xbsport) + edgecov(Xbmath) + edgecov(Xdifexc) + edges,
            control = control.ergm(MCMLE.maxit=20, MCMC.interval = 2000, MCMC.samplesize = 2000))
e5b <- ergm(Ynet ~ mutual + gwidegree(1) + gwodegree(1) + gwesp + edgecov(Xbsport) + edgecov(Xbmath) + edgecov(Xdifexc) + edges,
            control = control.ergm(MCMLE.maxit=50, MCMC.interval = 4096, MCMC.samplesize = 5000))
summary(e5)
mcmc.diagnostics(e5)
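# One optional follow-up to the diagnostics (assuming ergm's gof() defaults are
# acceptable for this purpose): simulate from a fitted model and compare its degree,
# shared-partner and geodesic distributions with the observed network.
g1 <- gof(e1)
plot(g1)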
# !!! NOTE THAT TO RUN 8.6 YOU NEED TO SAVE X! SO LET'S SAY THIS:
XG1 <- X
[8.6] R code:
library(igraph)
panel.cor <- function(x, y, digits = 2, prefix = "", rsize = F, cex.cor, ...){
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  rfac <- 1
  if(rsize) rfac <- r
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.5/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * rfac)
}
nodeys <- function(G, directed = T){
  bt <- betweenness(G)
  degi <- degree(G, mode = 'in')
  degb <- degree(G, mode = 'all')
  eig <- eigen_centrality(G, directed = directed)
  if(directed){
    pairs(~ degi + degb + bt + eig$vector,
          labels = c('Degree in', 'Total degree', 'Betweenness', 'Eigenvector'))
  }else{
    pairs(~ degb + bt + eig$vector,
          labels = c('Total degree', 'Betweenness', 'Eigenvector'),
          lower.panel = panel.smooth, upper.panel = panel.cor)
  }
}
N <- 500
p <- .1
XR <- erdos.renyi.game(N, p, type = c("gnp"), directed = F, loops = FALSE)
nodeys(XR, directed = F)
XP <- sample_pa(N, power = .5, m = 1, directed = F, algorithm = 'psumtree')
nodeys(XP, directed = F)
XSW <- sample_smallworld(1, N, 4, p/2)
nodeys(XSW, directed = F)
XG <- sample_grg(N, p/2)
nodeys(XG, directed = F)
# note XG1 is from Fake Network in previous code.
nodeys(graph_from_adjacency_matrix(XG1, mode = 'max'), directed = F)   # XG1 was saved as an adjacency matrix,
                                                                       # so convert it for the igraph-based centralities
[9.1] R-code:
consist <- function(x,y){
  num <- 0
  den <- 0
  for(i in 1:(length(x))){
    num <- num + min(x[i],y[i])
    den <- den + x[i]
  }
  consist <- num/den
  return(consist)
}
# fig 1
N <- 25
x <- runif(N,0, .5)
y <- runif(N,.5,1)
x[25] <- .5
y[25] <- .51
line1 <- lm(y ~ x)
r2 <- summary(line1)$r.squared
plot(x = x, y = y, type ='p', ylim = c(0,1), xlim = c(0,1), pch =16, cex = 1.5)
abline(a = 0, b = 1, lty=2, lwd = 2)
legend(x = 'bottomright', legend = paste0('R-sq: ', format(r2, digits=2)))
# fig 2
N <- 50
x2 <- runif(N,0, 1)
e <- rnorm(N,0, .1)
y2 <- x2 + e
line2 <- lm(y2 ~ x2)
r22 <- summary(line2)$r.squared
consist2 <- consist(x2,y2)
plot(x = x2, y = y2, type ='p', ylim = c(0,1), xlim = c(0,1), pch =16, cex = 1.5)
abline(a = 0, b = 1, lty=2, lwd = 2)
legend(x = 'bottomright', legend = paste0('R-sq: ', format(r22, digits=2)))
legend(x = 'topleft', legend = paste0('consistency: ', format(consist2, digits=2)))
x3 <- x2
y3 <- 1-y2
line3 <- lm(y3 ~ x3)
r23 <- summary(line3)$r.squared
consist3 <- consist(x3,y3)
plot(x = x3, y = y3, type ='p', ylim = c(0,1), xlim = c(0,1), pch =16, cex = 1.5)
abline(a = 0, b = 1, lty=2, lwd = 2)
legend(x = 'bottomleft', legend = paste0('R-sq: ', format(r23, digits=2)))
legend(x = 'topright', legend = paste0('consistency: ', format(consist3, digits=2)))
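# A tiny worked example of consist() with made-up numbers: for x = (1, 2, 3) and
# y = (1, 1, 3) the numerator is min(1,1) + min(2,1) + min(3,3) = 5 and the
# denominator is 1 + 2 + 3 = 6, so consistency is 5/6, roughly 0.83.
consist(c(1, 2, 3), c(1, 1, 3))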
R-code [9.2]:
install.packages('lattice')
library(lattice)
N <- 2000      # Number of individuals
G <- 50        # Size of grid
iter <- 100    # number of iterations
wdth <- 2      # size of window of neighbors
nghbr <- function(A, i, G, wdth = 2){
  n1 <- A$bel[(abs(A$hor - A$hor[i]) < wdth) & (abs(A$ver - A$ver[i]) < wdth)]
  return(n1)
}
MakeB <- function(A, BelMat, i){
  BelMat[A$ver[i], A$hor[i]] <- BelMat[A$ver[i], A$hor[i]] + A$bel[i]
  return(BelMat)
}
MakeN <- function(A, NMat, i){
  NMat[A$ver[i], A$hor[i]] <- NMat[A$ver[i], A$hor[i]] + 1
  return(NMat)
}
UpdateA <- function(A, BelMat, i){
  A$bel[i] <- BelMat[A$ver[i], A$hor[i]]
}
DoB <- function(A, G, N, iter, pf, rescale = F, res = 'Simulation'){
  BelMat <- matrix(data = 0, nrow = G, ncol = G)
  NMat <- matrix(data = 0, nrow = G, ncol = G)
  for (j in 1:N){
    BelMat <- MakeB(A, BelMat, j)
    NMat <- MakeN(A, NMat, j)
  }
  BelMat <- BelMat / NMat
  BelMat[which(NMat == 0)] <- 0
  B2 <- t(BelMat[G:1,])
  if(rescale){
    b2min <- min(B2)
    b2max <- max(B2)
    b2scale <- b2max - b2min
    B2 <- 2*(B2-b2min)/b2scale - 1   # rescale to the -1 to 1 range
  }
  at1 <- seq(-1, 1, 0.5)
  if(iter %% pf == 0) {
    print(levelplot(as.matrix(B2), col.regions = gray(0:100/100), at = at1,
                    colorkey = list(at=at1, labels=list(at=at1), xlab = 'horizontal', ylab = 'vertical',
                                    scales=list(draw=FALSE), main = res),
                    main = paste0('iteration ', iter)))
  }
  return(BelMat)
}
pA <- function (A, N){
  AA <- c(A$hor, A$ver, A$bel)
  dim(AA) <- c(N,3)
  print(AA)
}
moveme <- function(A, BelMat, trade_off = .5){
  G <- nrow(BelMat)
  NMat <- matrix(data = 0, nrow = G, ncol = G)
  N <- length(A$bel)
  for (j in 1:N){
    NMat <- MakeN(A, NMat, j)
  }
  for (i in 1:N){                    # loop over individuals
    alt_hor <- min(max(1,sample((A$hor[i]-1):(A$hor[i]+1), 1)),G)
    alt_ver <- min(max(1,sample((A$ver[i]-1):(A$ver[i]+1), 1)),G)
    home <- abs(BelMat[A$ver[i], A$hor[i]] - A$bel[i])
    away <- abs(BelMat[alt_ver, alt_hor] - A$bel[i])
    if(! (is.null(away))){
      if(((home ==0)&(away == 0))|(home == away)){
        home <- 1
        away <- 1
      }
      if(away != 0){
        Odds <- ((home/away)^exp(trade_off))*((NMat[alt_ver, alt_hor]/NMat[A$ver[i], A$hor[i]])^exp(1-trade_off))
        Paway <- Odds/(1+Odds)
        flip <- runif(1)
      }else{
        flip <- 0
        Paway <- 1
      }
      if(Paway > flip){
        A$ver[i] <- alt_ver
        A$hor[i] <- alt_hor
      }
    }
  }
  return(A)
}
sim1 <- function(iter, N, G, disc = F, samp = T, move = F, wdth = 2, pf = 5, trade_off = .5, rescale = F, print_me = -1){
  out_B <- NULL
  Curve <- vector(length = iter)
  A <- list()
  A$hor <- trunc(runif(N, 1, G+1))   # position of individuals
  A$ver <- trunc(runif(N, 1, G+1))
  A$bel <- runif(N, -1, 1)           # initial belief
  if(disc) A$bel <- trunc(A$bel + sign(A$bel)*1)
  BelMat <- DoB(A,G,N, 0, 1, rescale = rescale)
  for (i in 1: iter){
    p <- sample(N)                   # order to make updates
    for (j in 1:N){
      if(length(nghbr(A, p[j], G, wdth))>1){
        if(samp) {
          A$bel[p[j]] <- sample(nghbr(A, p[j], G, wdth), 1)
        }else{
          A$bel[p[j]] <- mean(nghbr(A, p[j], G, wdth))
          if(disc) A$bel[p[j]] <- sign(A$bel[p[j]])*1
        }
      }
    }
    BelMat <- DoB(A,G,N, i, pf)
    if(i == print_me){
      out_B <- BelMat
    }
    occ_med <- median(BelMat)
    Curve[i] <- mean(A$bel)
    if(any(abs(A$bel) > 1)) {
      print(paste0('error in iter ', i))
      print(paste0('and he is ', which(abs(A$bel) > 1) ))
      break
    }
    if(var(A$bel) == 0) {
      print(paste0('Reached homogeneity at iteration ', i))
      break
    }
    if(move) A <- moveme(A, BelMat, trade_off)
  }
  plot(x = 1: i, y = Curve[1:i], xlab = 'iteration', ylab = 'proportion positive', ylim = c(-1,1), col = 'grey75', lwd = 2)
  if(!is.null(out_B)) i <- out_B     # we want to return this instead
  return(i)
}
S1 <- sim1(iter = 1000, N = 1000, G = 25, disc = T, samp = T, move = T, wdth = 2, pf = 100, trade_off = 0)
S2 <- sim1(iter = 100, N = 1000, G = 25, disc = F, samp = F, wdth = 2)
S3 <- sim1(iter = 100, N = 1000, G = 25, disc = F, samp = F, wdth = 2, pf = 1)
H1 <- sim1(iter = 20, N = 35, G = 4, disc = T, samp = T, wdth = 2)
D1 <- sim1(iter = 20, N = 10, G = 4, disc = T, samp = F, wdth = 2)
S4 <- sim1(iter = 1000, N = 2000, G = 20, disc = T, samp = T, move = T, wdth = 2)
S5 <- sim1(iter = 1000, N = 2000, G = 20, disc = T, samp = T, move = T, wdth = 2, pf = 20, trade_off = 0)
# make the graph showing decreased time
ittook <- vector(length = 5)
for (i in 2:6){
  ittook[i] <- sim1(iter = 1000, N = 1000, G = 25, disc = T, samp = T, pf = 1000, wdth = i)
}
plot(x = 2:6, y = ittook[2:6], ylab = 'iterations to convergence', xlab = 'neighborhood', type = 'l', lwd = 3)
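# If you want the belief surface itself rather than the iteration index, sim1's
# print_me argument stores DoB's belief matrix at that iteration and returns it
# instead (B_50 is a made-up name and these settings are arbitrary; if the run
# reaches homogeneity before iteration 50, the usual return value comes back):
B_50 <- sim1(iter = 100, N = 1000, G = 25, disc = F, samp = F, wdth = 2, print_me = 50)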