##This is my script file for Sept. 6, 2012 computer lab

##Open the Hayes/White data and look at it
hayes_white = read.table("C:/Users/snu/Desktop/Temp/HayesWhite_SubjectData.txt", header=TRUE)
head(hayes_white)
summary(hayes_white)
hayes_white$LogResponse
head(hayes_white$LogResponse)
names(hayes_white)

##Plot it
plot(hayes_white$Albright2009, hayes_white$LogResponse)
plot(hayes_white$Hwi_Score, hayes_white$LogResponse)
plot(hayes_white$Hwi_ConstraintWeight, hayes_white$LogResponse)
plot(hayes_white$TrialNum, hayes_white$LogResponse)
plot(hayes_white$Subject, hayes_white$LogResponse)
plot(hayes_white$TestOrControl, hayes_white$LogResponse)

##Make a linear regression model
hayes_white.lm1 = lm(hayes_white$LogResponse ~ hayes_white$Albright2009)
summary(hayes_white.lm1)
###resulting regression equation: y = 4.384 + 0.003757*x
##Let's try to find the prediction for different values of x (x=Albright2009):
4.384 + 0.003757*0
4.384 + 0.003757*0.00001
4.384 + 0.003757*0.0001
4.384 + 0.003757*0.001
4.384 + 0.003757*0.01
4.384 + 0.003757*0.1
##The slope is very small!! Similar values for y despite changing x
plot(hayes_white$Albright2009, hayes_white$LogResponse)
abline(hayes_white.lm1, col="blue")
##When we plot it,we can see that the slope is very small. y is always close to 4.4

##Make another linear regression model with a different indep. variable
hayes_white.lm2 = lm(hayes_white$LogResponse ~ hayes_white$Naturalness)
summary(hayes_white.lm2)
##resulting regression equation: y = 4.34108 + 0.15818*x
#This slope looks bigger...
#But what are the possible values of x?
summary(hayes_white$Naturalness) #just "natural" (0) or "unnatural" (1)
#So there are only two values to try:
4.34108 + 0.15818*0
4.34108 + 0.15818*1
#predicts a difference of about 0.15 log rating points between natural and unnatural
#I wonder what will happen if I try to plot this?
plot(hayes_white$Naturalness, hayes_white$LogResponse)
abline(hayes_white.lm2, col="red")


##Make another linear regression model with 2 indep. variables
hayes_white.lm3 = lm(hayes_white$LogResponse ~
    hayes_white$TestOrControl+ hayes_white$Naturalness)
summary(hayes_white.lm3)
##The resulting regression equation is: y = 4.72127 + -0.76370*x1 + 0.15984*x2
#But x1 (TestOrControl) and x2 (Naturalness) are both binary.
#So there are only four combinations of values to try
4.72127 + -0.76370*0 + 0.15984*0 #control, natural
4.72127 + -0.76370*0 + 0.15984*1 #control, unnatural
4.72127 + -0.76370*1 + 0.15984*0 #test, natural
4.72127 + -0.76370*1 + 0.15984*1 #test, unnatural
#There's a range of almost 1 log rating point
#But do all 4 combinations really occur?
table(hayes_white$TestOrControl, hayes_white$Naturalness) #yes, they do! OK.

##Make a linear regression model with an interaction term
hayes_white.lm4 = lm(hayes_white$LogResponse ~
    hayes_white$TestOrControl* hayes_white$Naturalness)
summary(hayes_white.lm4)
#regression equation is:
#y = 5.00320 + -1.33002*TestOrControl +
#     -0.40329*Naturalness +
#     1.12872*TestOrControl*Naturalness
#again there are only 4 values to try:
5.00320 + -1.33002*0 + -0.40329*0 + 1.12872*0*0 #control, natural
#interaction term doesn't matter

5.00320 + -1.33002*0 + -0.40329*1 + 1.12872*0*1 #control, unnatural
#interaction term doesn't matter (0*1=0!!)

5.00320 + -1.33002*1 + -0.40329*0 + 1.12872*1*0 #test, natural
#interaction term doesn't matter (0*1=0!!)

5.00320 + -1.33002*1 + -0.40329*1 + 1.12872*1*1 #test, unnatural
#predicted value is a little bigger
#-->Being test gives lower value than control
#Being unnatural gives lower value than natural
#But being test AND unnatural gives a higher value than otherwise expected

##This is a trick to be able to plot all 4 combinations
##I'm making a new variable that's a combination of two old variables
hayes_white$combined_variables = 
  factor(paste(hayes_white$TestOrControl, hayes_white$Naturalness))
head(hayes_white$combined_variables)
##Now I plot this new variable
plot(hayes_white$combined_variables, hayes_white$LogResponse)
summary(hayes_white)
abline(hayes_white.lm4, col="red")

########################3

##Read in Hungarian data and look at it
hungarian = read.table("C:/Users/snu/Desktop/Temp/03_Hungarian_no_vowels2.txt", header=TRUE)
summary(hungarian)
names(hungarian)

##Make a logistic regression model
hungarian.glm = glm(hungarian$is_output_nak ~
    hungarian$number_of_neutral_Vs
   + hungarian$last_V_height
   + hungarian$last_V_length
#   + hungarian$penult_V_height #by putting "#" at beginning
#   + hungarian$penult_V_length #I told R to ignore the line
   + hungarian$B_height
   + hungarian$B_length
   , family = binomial)
summary(hungarian.glm)
#regression equation is
# y = 1 / (1+exp(-1*(10.0169+-4.8679*num_of_Vs+
#   -3.1713*last_V_height+1.0355*last_V_length+
#   0.6517*B_height+0.1940*B_length)))
#I'll plug in values for just one imaginary item:
1 / (1+exp(-1*(10.0169+-4.8679*2+
   -3.1713*1+1.0355*0+
   0.6517*2+0.1940*1))) #2 neutral vowels, final is high
#and short; back V is high-mid and long
#Model predicts that 19.89611 % of words like this have -nak


##Make two models, one with a superset of the other's indep. variables
hungarian.glm_complex = glm(hungarian$is_output_nak ~
    hungarian$last_V_height
   + hungarian$last_V_length
   + hungarian$B_height
   , family = binomial)
summary(hungarian.glm_complex)

hungarian.glm_simpler = glm(hungarian$is_output_nak ~
    hungarian$last_V_height
   + hungarian$last_V_length
   , family = binomial)
summary(hungarian.glm_simpler)

##Use anova() to get a p-value from their likelihood ratio
anova(hungarian.glm_simpler, hungarian.glm_complex, 
	test="Chisq")