您的位置:首页 > 理论基础

伍德里奇计量经济学导论之计算机操作题的R语言实现(虚拟变量)

2015-11-29 15:50 1641 查看

引言

前几章涉及的自变量都是定量变量,本章将讨论定性自变量。主要内容包括:

单个虚拟变量区分两个组

g-1个虚拟变量来区分g个组

用虚拟变量来解释序数变量

虚拟变量与定量变量的交互作用,并应用邹至庄检验来检验各组是否存在显著差异

线性概率模型(本文不涉及)

本章Rmd文本以及所有整理好的数据见这里

C7.1 虚拟变量回归与联合变量显著检验

# C7.1 -- dummy-variable regression and a joint significance test.

# (1) Regress college GPA on PC ownership, high-school GPA, ACT score and the
#     two parental-education dummies.
data_gpa1 <- read.csv('/home/wangjianlong/files/programs/college_life/econometrics/excel_data_1/gpa1.csv', header = TRUE)
lm_gpa1 <- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll, data = data_gpa1)
summary(lm_gpa1)
# Fit recorded by the author:
#   colGPA = 1.255554 + 0.151854 PC + 0.450220 hsGPA + 0.007724 ACT
#            - 0.003758 mothcoll + 0.041800 fathcoll
#   Adjusted R-squared: 0.1934, n = 141
# Holding the other regressors fixed, students who own a PC have a colGPA that
# is on average 0.151854 higher than students who do not.
# The recorded p-value on PC is 0.011 < 0.05, so PC is statistically
# significant at the 5% level.

# (2) Joint significance of mothcoll and fathcoll: fit the restricted model
#     (both dummies dropped) and compare it with the unrestricted one.
lm_gpa1_1 <- lm(colGPA ~ PC + hsGPA + ACT, data = data_gpa1)
summary(lm_gpa1_1) # author recorded R-squared 0.2194 for the restricted model
# anova() takes the sums of squares and degrees of freedom straight from the
# two fits, so nothing has to be copied (and rounded) by hand.
anova(lm_gpa1_1, lm_gpa1)
# The author's hand calculation from rounded R-squared values
# ((0.2222 - 0.2194) / 2) / ((1 - 0.2222) / 135) gave F = 0.2429931 and
# p = 0.7846192. With such a large p-value the two variables are jointly
# insignificant.

# (3) Add the square of hsGPA to the full model.
lm_gpa1_2 <- lm(
  colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll + I(hsGPA^2),
  data = data_gpa1
)
summary(lm_gpa1_2)
# Author's conclusion: the extension is not needed. Once the square is added,
# both the linear and the quadratic term become insignificant; moreover the
# estimated effect of hsGPA is U-shaped with a turning point at hsGPA = 2.68,
# which is hard to interpret.


C7.2二次项变量以及交互虚拟变量设定

# C7.2 -- quadratic terms and interacted dummy variables.

# (1) Log-wage equation with marital status, race and location dummies.
data_wage2 <- read.csv('/home/wangjianlong/files/programs/college_life/econometrics/excel_data_1/wage2.csv', header = TRUE)
lm_wage2 <- lm(
  log(wage) ~ educ + exper + tenure + married + black + south + urban,
  data = data_wage2
)
summary(lm_wage2)
# Fit recorded by the author:
#   log(wage) = 5.395497 + 0.065431 educ + 0.014043 exper + 0.011747 tenure
#               + 0.199417 married - 0.188350 black - 0.090904 south
#               + 0.183912 urban
#   Adjusted R-squared: 0.2469, n = 935
# Holding the other factors fixed, blacks earn on average about 18.8% less
# than non-blacks.

# (2) Add exper^2 and tenure^2 and test them jointly.
lm_wage2_1 <- lm(
  log(wage) ~ educ + exper + tenure + married + black + south + urban +
    I(exper^2) + I(tenure^2),
  data = data_wage2
)
summary(lm_wage2_1) # author recorded R-squared 0.255
# Compare restricted vs. unrestricted fit directly instead of plugging rounded
# R-squared values into the F formula by hand.
anova(lm_wage2, lm_wage2_1)
# The author's hand calculation ((0.255 - 0.2526) / 2) / ((1 - 0.255) / 925)
# gave F = 1.489933 and p = 0.2259282. Even at the 20% level the null is not
# rejected, so the two quadratic terms are not jointly significant.

# (3) Let the return to education depend on race.
lm_wage2_2 <- lm(
  log(wage) ~ educ + exper + tenure + married + black + south + urban +
    black:educ,
  data = data_wage2
)
summary(lm_wage2_2)
# The interaction says that one more year of education pays, on average, about
# 2.3 percentage points less for blacks than for non-blacks. Its p-value is
# large, however, so the null of equal returns cannot be rejected.

# (4) Four marital-status x race groups, single non-blacks as the base group.
# The group dummies are built by name in base R. This avoids relying on
# dummyVars() (a caret function that was never loaded here) and on renaming
# its output columns by position.
is_black <- data_wage2$black == 1
is_married <- data_wage2$married == 1
data_wage2_1 <- data.frame(
  data_wage2[, c("lwage", "educ", "exper", "tenure", "south", "urban")],
  single_nonblack = as.numeric(!is_married & !is_black),
  single_black = as.numeric(!is_married & is_black),
  married_nonblack = as.numeric(is_married & !is_black),
  married_black = as.numeric(is_married & is_black)
)
# single_nonblack is left out of the regression: it is the base group.
lm_wage2_3 <- lm(
  lwage ~ educ + exper + tenure + south + urban +
    single_black + married_nonblack + married_black,
  data = data_wage2_1
)
summary(lm_wage2_3)
# Fit recorded by the author:
#   lwage = 5.403793 + 0.065475 educ + 0.014146 exper + 0.011663 tenure
#           - 0.091989 south + 0.184350 urban - 0.240820 single_black
#           + 0.188915 married_nonblack + 0.009448 married_black
# Estimated log-wage differential between married blacks and married
# non-blacks, taken from the fitted coefficients rather than typed in:
wage2_coefs <- coef(lm_wage2_3)
wage2_coefs[["married_black"]] - wage2_coefs[["married_nonblack"]]
# With the recorded coefficients this is 0.009448 - 0.188915, about -0.18:
# married blacks earn roughly 18% less than married non-blacks.


C7.3 对数函数系数的精确解释以及联合变量检验

# C7.3 -- exact interpretation of a dummy coefficient in a log model, and a
# joint test of the position dummies.

# (1) Log-salary equation including the five position dummies.
data_mlb1 <- read.csv('/home/wangjianlong/files/programs/college_life/econometrics/excel_data_1/mlb1.csv', header = TRUE)
lm_mlb1 <- lm(
  log(salary) ~ years + Gamesyr + bavg + hrunsyr + rbisyr + runsyr + fldperc +
    allstar + frstbase + scndbase + thrdbase + shrtstop + Catcher,
  data = data_mlb1
)
summary(lm_mlb1)
# The author recorded a p-value of 0.054 for Catcher, so the null of no
# difference can be rejected at, say, the 6% level.
# Exact percentage difference implied by a dummy coefficient b in a log model
# is 100 * (exp(b) - 1); use the fitted coefficient rather than a typed value.
100 * (exp(coef(lm_mlb1)[["Catcher"]]) - 1)
# With b rounded to 0.25 the author obtained 28.40254%: holding the other
# variables fixed, catchers earn on average about 28% more, a sizeable gap.

# (2) H0: b9 = b10 = b11 = b12 = b13 = 0 (no salary differences by position).
#     Restricted model: drop the five position dummies.
lm_mlb1_1 <- lm(
  log(salary) ~ years + Gamesyr + bavg + hrunsyr + rbisyr + runsyr + fldperc +
    allstar,
  data = data_mlb1
)
summary(lm_mlb1_1) # author recorded R-squared 0.6445
# anova() uses the unrestricted model's residual degrees of freedom for the
# denominator automatically.
anova(lm_mlb1_1, lm_mlb1)
# NOTE(review): the earlier hand calculation
# ((0.6535 - 0.6445) / 5) / ((1 - 0.6535) / 344) = 1.787013, p = 0.114809
# used 344 denominator df, which looks like the restricted model's residual df
# (the unrestricted model has five more parameters) -- confirm against the
# anova() table above.
# At the 5% level the null is not rejected: average salaries do not differ
# detectably across positions.

# (3) Author's remark: the two parts lead to broadly the same conclusion,
#     because the evidence found in part (2) is weak.


C7.4 交互虚拟变量设定

# C7.4 -- specifying interacted dummy variables.
data_gpa2 <- read.csv('/home/wangjianlong/files/programs/college_life/econometrics/excel_data_1/gpa2.csv', header = TRUE)

# (1) Expected signs (author's reasoning): B3 should be negative and B4
#     positive. Athletes would usually be expected to do somewhat worse than
#     non-athletes, so B6 should be negative. The signs for gender and for the
#     size of the graduating class are hard to predict.

# (2) Baseline equation.
lm_gpa2 <- lm(
  colgpa ~ hsize + I(hsize^2) + Hsperc + sat + female + athlete,
  data = data_gpa2
)
summary(lm_gpa2)
# Fit recorded by the author:
#   colgpa = 1.241 - 0.05685 hsize - 0.00467 hsize^2 - 0.0132 Hsperc
#            + 0.00164 sat + 0.1549 female + 0.01693 athlete
#   Adjusted R-squared: 0.2915, n = 4137
# NOTE(review): the recorded equation gives 0.01693 for athlete while the
# interpretation below quotes 0.1693 -- read the value from summary() above.
# Holding the other factors fixed, athletes' colgpa is 0.1693 higher than that
# of non-athletes; the p-value is small, so b6 = 0 is rejected and the athlete
# effect is statistically significant.

# (3) Drop sat from the equation.
lm_gpa2_1 <- lm(
  colgpa ~ hsize + I(hsize^2) + Hsperc + female + athlete,
  data = data_gpa2
)
summary(lm_gpa2_1)
# Without sat the p-value on athlete becomes large and the null can no longer
# be rejected. Author's explanation: when sat is not controlled for, athletes
# on average score lower than non-athletes.

# (4) Four gender x athlete groups, female non-athletes as the base group.
# The dummies are built by name in base R rather than with dummyVars() (a
# caret function that was never loaded here) followed by positional renaming.
is_female <- data_gpa2$female == 1
is_athlete <- data_gpa2$athlete == 1
data_gpa2_1 <- data.frame(
  data_gpa2[, c("colgpa", "hsize", "Hsperc", "sat")],
  male_nonath = as.numeric(!is_female & !is_athlete),
  female_nonath = as.numeric(is_female & !is_athlete),
  male_ath = as.numeric(!is_female & is_athlete),
  female_ath = as.numeric(is_female & is_athlete)
)
head(data_gpa2_1)
# female_nonath is left out of the regression: it is the base group.
lm_gpa2_2 <- lm(
  colgpa ~ hsize + I(hsize^2) + Hsperc + sat +
    male_nonath + male_ath + female_ath,
  data = data_gpa2_1
)
summary(lm_gpa2_2)
# The coefficient on female_ath is the estimated colgpa difference between
# female athletes and female non-athletes, other variables held fixed.
coef(lm_gpa2_2)[["female_ath"]]
# NOTE(review): the author recorded this difference as 0.01751 -- confirm
# against the value printed above.

# (5) Does the effect of sat differ by gender?
lm_gpa2_3 <- lm(
  colgpa ~ hsize + I(hsize^2) + Hsperc + sat + female + athlete + female:sat,
  data = data_gpa2
)
summary(lm_gpa2_3)
# The p-value on the interaction is too large to reject a zero coefficient,
# and the coefficient itself is small, so the interaction adds very little.


C7.5略

C7.6 邹至庄检验两个方程是否相等

# C7.6 -- Chow test for equality of the sleep equation across genders.
data_sleep75 <- read.csv('/home/wangjianlong/files/programs/college_life/econometrics/excel_data_1/sleep75.csv', header = TRUE)
# subset() keeps only rows where the condition is TRUE, so rows with a missing
# Male value are dropped from both groups.
male_sleep75 <- subset(data_sleep75, Male == 1)
female_sleep75 <- subset(data_sleep75, Male != 1)

# (1) Separate equations for men and women.
lm_sleep75_male <- lm(
  sleep ~ totwrk + educ + age + I(age^2) + yngkid,
  data = male_sleep75
)
summary(lm_sleep75_male)
# Recorded by the author (men):
#   sleep = 3648.20826 - 0.18212 totwrk - 13.05238 educ + 7.15659 age
#           - 0.04477 age^2 + 60.38021 yngkid
lm_sleep75_female <- lm(
  sleep ~ totwrk + educ + age + I(age^2) + yngkid,
  data = female_sleep75
)
summary(lm_sleep75_female)
# Recorded by the author (women):
#   sleep = 4238.72933 - 0.13995 totwrk - 10.20514 educ - 30.35657 age
#           - 0.36794 age^2 - 118.28256 yngkid
# Author's observation: the intercept and the young-child term have a large
# effect on sleep time.

# (2) Chow test of the equations from part (1) (the author was unsure how to
#     read the question and tested those equations directly).
library(gap)
# The design matrices must contain the regressors of the part (1) equations:
# totwrk, educ, age, age^2 and yngkid. The previous version passed educ and
# educ^2 and left age out, so it tested a different model.
x1 <- with(male_sleep75, data.frame(totwrk, educ, age, age2 = age^2, yngkid))
x2 <- with(female_sleep75, data.frame(totwrk, educ, age, age2 = age^2, yngkid))
chow.test(male_sleep75$sleep, as.matrix(x1), female_sleep75$sleep, as.matrix(x2))
# NOTE(review): the author's recorded p-value of 0.035 (rejecting equality of
# the two equations at the 5% level) came from the educ/educ^2 specification;
# re-read the p-value from the output above.

# (3) Allow a different intercept for men and test the five slope
#     interactions jointly.
# Male * (...) expands to every main effect plus every Male:x interaction,
# including Male:yngkid, which the earlier formula omitted even though the
# hand-computed F statistic used 5 numerator degrees of freedom.
lm_sleep75 <- lm(
  sleep ~ Male * (totwrk + educ + age + I(age^2) + yngkid),
  data = data_sleep75
) # unrestricted model
summary(lm_sleep75)
lm_sleep75_1 <- lm(
  sleep ~ Male + totwrk + educ + age + I(age^2) + yngkid,
  data = data_sleep75
) # restricted model
summary(lm_sleep75_1) # author recorded R-squared 0.1228
# F statistic and p-value computed from the two fits themselves.
anova(lm_sleep75_1, lm_sleep75)
# The author's hand calculation ((0.1272 - 0.1228) / 5) / ((1 - 0.1272) / 695)
# gave an F statistic of 0.7007333 (p = 0.6230248) for the four-interaction
# specification. With a p-value that large the interactions are not jointly
# significant; confirm with the table above.


剩下的题目大同小异,这里不再赘述。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: