##########################################################################
## Preliminaries
##########################################################################

## Load necessary packages
library(languageR)
library(lme4)   ## provides lmer(); older versions of languageR attach it automatically, but loading it explicitly is safer

## Get data into R. Two perils here: make sure everything in the file really is comma-separated,
## and designate comma as the separator.
## A caution on the input file: you have to put "NA" for every missing value in the input file.
## Also, whenever you take a mean or some other expression that would be affected by missing values,
## you must add the expression na.rm=TRUE so that it will compute means ignoring those values.

MyData = read.table("C:/251Simulations/RTurk/RInputFile.csv", header=T, sep=",")
head(MyData)

##########################################################################
## Basic descriptive statistics
##########################################################################

## This prints out a list of all the subject IDs
levels(MyData$SubjectID)

## The following give you 2 x 2 tables allowing you to eyeball an effect size.

xtabs( ~ StemFinal + Tapped, data = MyData)
##          Tapped
## StemFinal   0   1
##         0 598 382
##         1  81 101
## This looks promising.

xtabs( ~ AlveolarStopPrecedes + Tapped, data = MyData)
##                     Tapped
## AlveolarStopPrecedes   0   1
##                    0 585 406
##                    1  94  77
## This does not look promising.

xtabs( ~ AlveolarStopFollows + Tapped, data = MyData)
##                    Tapped
## AlveolarStopFollows   0   1
##                   0 588 450
##                   1  91  33
## This looks promising.

xtabs( ~ Pretonic + Tapped, data = MyData)
##         Tapped
## Pretonic   0   1
##        0 522 472
##        1 157  11
## Since tapping is supposed to be impossible pretonically, this ought to be perfect; the 11 pretonic
## taps indicate possible problems (inaccurate transcription, nonnative speaker, hasty clicking, etc.).

xtabs( ~ PostTonic + Tapped, data = MyData)
##          Tapped
## PostTonic   0   1
##         0 588 364
##         1  91 119
## This looks promising.

## How flappatory are the words? Two different ways to get it.

## Method I
MyXtabs = xtabs( ~ Word + Tapped, data = MyData)
MyXtabs

## Proportions:
MyPropTable = prop.table(MyXtabs, 1)
MyPropTable
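## An optional follow-up (a sketch, not part of the original script): since Tapped is coded 0/1,
## the "1" column of MyPropTable is each word's proportion of tapped tokens, and sorting that
## column ranks the words by flappatoriness. The object name MyWordRates is made up for this
## illustration.
MyWordRates = MyPropTable[, "1"]
head(sort(MyWordRates, decreasing = TRUE), 10)   ## the ten most flappatory words
head(sort(MyWordRates), 10)                      ## the ten least flappatory words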
## Method II
## From above: remember to add na.rm=TRUE, so that the means are computed ignoring missing values.
tapply(MyData$Tapped, MyData$Word, mean, na.rm=TRUE)
##      absurdity     accredited       activity    alternative      antiquity
##     0.42857143     0.71428571     0.42857143     0.42857143     0.85714286
##        anxiety  argumentative           atom      authority      barometer
##     0.85714286     0.57142857     0.76190476     0.42857143     0.42857143
##         beetle        bigoted       bulletin        capital     capitalism
##     0.80952381     0.71428571     0.14285714     0.57142857     0.42857143
##   capitalistic      carpeting       catheter         cavity      charlatan
##     0.85714286     0.85714286     0.71428571     0.57142857     0.16666667
##       chastity      comforter  commemorative      commodity      community
##     0.42857143     0.71428571     0.14285714     0.57142857     0.42857143
##     competence      competent    competitive     competitor    consecutive
##     0.16666667     0.00000000     0.14285714     0.42857143     0.57142857
##   conservative    conspirator   cosmopolitan     creativity     creditable
##     0.42857143     0.57142857     0.28571429     0.57142857     0.85714286
##        curator         deputy     derivative       diameter        digital
##     0.28571429     0.57142857     0.42857143     0.57142857     0.42857143
##      dramatist         editor        egotist         entity         equity
##     0.00000000     0.71428571     0.14285714     0.28571429     0.85714286
##       executor    fallibility       fugitive        gelatin      generator
##     0.42857143     0.85714286     0.00000000     0.00000000     0.59523810
##       helmeted        heretic       heritage       hesitant       humidity
##     0.28571429     0.57142857     0.28571429     0.00000000     0.71428571
##      hypnotist      hypnotize       identity       impotent     inevitable
##     0.14285714     0.04761905     0.50000000     0.14285714     1.00000000
##      ingenuity     inhabitant    inheritance      inhibited     inquisitor
##     0.57142857     0.14285714     0.28571429     0.42857143     0.57142857
##      intuitive       irritant        jupiter       lavatory         laxity
##     0.14285714     0.14285714     0.42857143     0.11904762     0.66666667
##     legibility     liberality        liberty        limited      liquidity
##     0.42857143     0.14285714     0.42857143     0.57142857     1.00000000
##      lucrative        marital     marketable  Mediterranean   metropolitan
##     0.42857143     0.28571429     0.71428571     0.14285714     0.00000000
##       militant   militaristic       military       modality       negative
##     0.14285714     0.14285714     0.09523810     0.57142857     0.14285714
##         nicety      nutritive     omnipotent      parameter       penitent
##     0.00000000     0.33333333     0.14285714     0.14285714     0.33333333
##      perimeter       positive        poverty       predator   preventative
##     0.42857143     0.28571429     0.57142857     0.57142857     0.28571429
##      profanity     profitable       property         proton    provocative
##     0.28571429     0.71428571     0.57142857     0.00000000     0.42857143
##        puberty        puritan       putative        quality       quantity
##     0.71428571     0.00000000     0.00000000     0.00000000     0.28571429
##       relative     repetitive representative        rickety      samaritan
##     0.14285714     0.42857143     0.28571429     0.57142857     0.42857143
##       sanctity       sedative    selectivity     separatist       Sheraton
##     0.28571429     0.28571429     0.57142857     0.42857143     0.33333333
##       skeleton        society       spirited      stupidity        sweater
##     0.00000000     0.71428571     0.28571429     0.57142857     0.66666667
##      tentative        theater    thermometer     university      unlimited
##     0.57142857     0.85714286     0.71428571     0.42857143     0.71428571
##       validity        velvety        visitor
##     0.83333333     0.71428571     0.42857143

## How flappatory are the subjects?

## Raw numbers:
MyXtabs = xtabs( ~ SubjectID + Tapped, data = MyData)
MyXtabs

## Proportions:
MyPropTable = prop.table(MyXtabs, 1)
MyPropTable

## Other method:
tapply(MyData$Tapped, MyData$SubjectID, mean, na.rm=TRUE)
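## A quick optional check (a sketch, not part of the original script): saving the by-subject
## rates lets you summarize and plot them, showing how much speakers vary; this variation is
## what motivates the by-subject random effects used below. The object name MySubjectRates is
## made up for this illustration.
MySubjectRates = tapply(MyData$Tapped, MyData$SubjectID, mean, na.rm=TRUE)
summary(MySubjectRates)   ## minimum, quartiles, and maximum of the per-subject tapping rates
hist(MySubjectRates, xlab = "Proportion tapped", main = "Tapping rate by subject")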
##########################################################################
## Statistical testing with linear mixed effects model
##########################################################################

## We want to model the data in a way that suitably discounts the idiosyncrasies of individual
## subjects and words. Subjects and words are "random effects"; the predictors we care about are
## "fixed effects".

## We'll start by creating a series of very simple regression models, each with one fixed effect
## and with subjects and words as random effects.
## Note that flunking this test is not conclusive: as we'll see, a fixed effect might actually
## perform better in concert with other fixed effects.

## Here are two important details (thanks to Robert for his help on both).
## First, we are modeling categorical data (tapped vs. not tapped), so we must do logistic
## regression. This is requested here by adding this parameter value: family=binomial
## (Note: this syntax is from the version of lme4 current when this script was written; recent
## versions of lme4 fit the same model with glmer(..., family=binomial) instead.)
## Second, to be conservative, we should assume that the subjects' affection for tapping
## stem-final consonants is idiolect-specific. We do this by using (StemFinal|SubjectID)
## instead of (1|SubjectID).

## To illustrate, this is a failed model that wrongly claimed significance for StemFinal alone:
MyBadLMer1 = lmer(Tapped ~ StemFinal + (1|SubjectID) + (1|Word), data=MyData, family=binomial)
MyBadLMer1
## The legit version is given immediately below.

## OK, the series of models:

## StemFinal
MyLMer = lmer(Tapped ~ StemFinal + (StemFinal|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
## p was .057, not quite significant

## AlveolarStopPrecedes
MyLMer = lmer(Tapped ~ AlveolarStopPrecedes + (AlveolarStopPrecedes|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
## p was .382, not significant

## AlveolarStopFollows
MyLMer = lmer(Tapped ~ AlveolarStopFollows + (AlveolarStopFollows|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
## p was .0057, significant

## Pretonic
MyLMer = lmer(Tapped ~ Pretonic + (Pretonic|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
## p was 3.44e-6, highly significant

## PostTonic
MyLMer = lmer(Tapped ~ PostTonic + (PostTonic|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
## p was .563, not significant

## Frequency
MyLMer = lmer(Tapped ~ Frequency + (Frequency|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
## p was .3458, not significant

## LogFreq
MyLMer = lmer(Tapped ~ LogFreq + (LogFreq|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
## p was .688, not significant

## Now a kitchen-sink model. We use raw frequency because it worked better on its own.
## This big model takes about three minutes to run!
MyLMerKitchenSink = lmer(Tapped ~ StemFinal + AlveolarStopPrecedes + AlveolarStopFollows + Pretonic + PostTonic + Frequency + (StemFinal + AlveolarStopPrecedes + AlveolarStopFollows + Pretonic + PostTonic + Frequency|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMerKitchenSink
##                        Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -0.3100489  0.1995929  -1.553  0.12033
## StemFinal             1.1504260  0.4000813   2.875  0.00403 **
## AlveolarStopPrecedes  0.0906628  0.2846493   0.319  0.75010
## AlveolarStopFollows  -1.6517273  0.4088755  -4.040 5.35e-05 ***
## Pretonic             -4.1449222  0.7420142  -5.586 2.32e-08 ***
## PostTonic             1.0777263  0.6286592   1.714  0.08647 .
## Frequency             0.0001903  0.0001552   1.227  0.21992

## Let's throw out the really crappy variables now.
MyLMer = lmer(Tapped ~ StemFinal + AlveolarStopFollows + Pretonic + PostTonic + (StemFinal + AlveolarStopFollows + Pretonic + PostTonic|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -0.2048     0.1770  -1.157  0.24742
## StemFinal             1.1410     0.4001   2.851  0.00435 **
## AlveolarStopFollows  -1.6200     0.3809  -4.253 2.11e-05 ***
## Pretonic             -3.8041     0.6842  -5.560 2.70e-08 ***
## PostTonic             1.0099     0.6268   1.611  0.10713
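## An interpretive aside (a sketch, not part of the original script): the Estimate column is in
## log-odds; exponentiating converts the estimates to odds ratios, which some people find easier
## to read. fixef() is lme4's extractor for the fixed-effect estimates of a fitted model.
exp(fixef(MyLMer))
## e.g. exp(1.1410) is about 3.13: being stem-final roughly triples the odds of tapping.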
##################################################
## Using Anovas to compare models
##################################################

## When you put together a model of a process, you can add as many parameters as you like, and
## each will almost certainly improve the fit to the data. But often this improvement is an
## accident. You can assess the probability of this by using an appropriate test. Here, R asks
## for an "Anova", but in context this really means "likelihood ratio test" (see Baayen, p. xxx).

## Let's go with a slightly more conservative model as our base; leave out the dubious PostTonic environment.
MySignatureLMer = lmer(Tapped ~ StemFinal + AlveolarStopFollows + Pretonic + (StemFinal + AlveolarStopFollows + Pretonic|SubjectID) + (1|Word), data=MyData, family=binomial)
MySignatureLMer
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -0.2357     0.1619  -1.456  0.14552
## StemFinal             1.2986     0.4189   3.100  0.00194 **
## AlveolarStopFollows  -1.6925     0.4264  -3.970 7.19e-05 ***
## Pretonic             -3.3065     0.6624  -4.992 5.98e-07 ***

## We'll give PostTonic one last chance.
MyLMer = lmer(Tapped ~ StemFinal + AlveolarStopFollows + Pretonic + PostTonic + (StemFinal + AlveolarStopFollows + Pretonic + PostTonic|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
## The Anova test between the two models just given. Intriguingly, it comes out enormously significant.
anova(MySignatureLMer, MyLMer)
## p = 1.980e-10

## Let's also give Frequency one last chance.
MyLMer = lmer(Tapped ~ StemFinal + AlveolarStopFollows + Pretonic + Frequency + (StemFinal + AlveolarStopFollows + Pretonic + Frequency|SubjectID) + (1|Word), data=MyData, family=binomial)
MyLMer
anova(MySignatureLMer, MyLMer)
## It flunks:
## p = .1863

## We'll also try taking apparently legitimate factors out of the model, one by one, and see how much it hurts.

## StemFinal should be kept.
MyDiminishedLMer = lmer(Tapped ~ AlveolarStopFollows + Pretonic + (AlveolarStopFollows + Pretonic|SubjectID) + (1|Word), data=MyData, family=binomial)
MyDiminishedLMer
anova(MySignatureLMer, MyDiminishedLMer)
## p = 4.076e-05

## AlveolarStopFollows should be kept.
MyDiminishedLMer = lmer(Tapped ~ StemFinal + Pretonic + (StemFinal + Pretonic|SubjectID) + (1|Word), data=MyData, family=binomial)
MyDiminishedLMer
anova(MySignatureLMer, MyDiminishedLMer)
## p = 5.353e-05

## Pretonic should be kept.
MyDiminishedLMer = lmer(Tapped ~ StemFinal + AlveolarStopFollows + (StemFinal + AlveolarStopFollows|SubjectID) + (1|Word), data=MyData, family=binomial)
MyDiminishedLMer
anova(MySignatureLMer, MyDiminishedLMer)
## p = 8.737e-08

## UPSHOT
## We're pretty sure that StemFinal, Pretonic, and AlveolarStopFollows are significant effects;
## not so sure about PostTonic; Frequency fails.

## FOR FUTURE USE

##########################################################################
## Modeling continuous data
##########################################################################

## It's still lmer(), but you leave out the parameter family=binomial.
## Also, computing significance values is harder. There's no standard formula; instead you use a
## Monte Carlo simulation, which languageR's pvals.fnc() provides.
MyLMer = lmer(Tapped ~ StemFinal + AlveolarStopFollows + Pretonic + PostTonic + (1|SubjectID) + (1|Word), data=MyData)
MyPVals = pvals.fnc(MyLMer)
MyPVals
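## A sketch of how the result might be inspected (component names as documented for languageR's
## pvals.fnc(); if in doubt, run str(MyPVals) on your own output to see what it contains):
MyPVals$fixed    ## MCMC-based p-values for the fixed effects
MyPVals$random   ## the random-effects parameters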