/***************************************************************************************************/
/* AST NLSY79 dataset preparation                                                                  */
/* Bryan S. Graham, NYU (w/ Dan Egel and Cristine Pinto)                                           */
/* bsg1@nyu.edu                                                                                    */
/* March 2011                                                                                      */
/***************************************************************************************************/

/***************************************************************************************************/
/* This do file and the accompanying Stata dictionary file report estimation results and figures  */
/* presented in the paper "Auxiliary-to-Study Tilting". The data and do file are provided "as is". */
/* I am unable to assist with their interpretation or use. However, please do feel free to e-mail  */
/* me if you find any mistakes at bryan.graham@nyu.edu.                                            */
/*                                                                                                 */
/* Overview of what this script does, in order:                                                    */
/*   1. reads the raw extract through the dictionary file NLSY_BlkWhtGap.DCT                       */
/*   2. builds demographic, AFQT, earnings and wage variables from the raw R####### items          */
/*   3. defines the analysis sample indicator NJ_sample                                            */
/*   4. logs weighted, household-clustered regressions on that sample                              */
/*   5. keeps NJ_sample observations and writes NLSY79_Sample.out and NLSY79_Sample.dta            */
/***************************************************************************************************/

/* use a semicolon as the command delimiter */
#delimit ;

/* Pin command syntax to the release this file was written for. The header is dated March 2011,   */
/* so Stata 11 is ASSUMED here (lower the number if the file must run on an older release).       */
/* Without version control the old-style -table ..., c()- call below is rejected by Stata 17+.    */
version 11;

clear matrix;
clear;
set matsize 800;
set memory 100m;

/* Adjust the SOURCE_DATA directory to point to the location of the NLSY_BlkWhtGap.DCT dictionary */
/* file. Adjust the WRITE_DATA and DO_FILES directories to point to the location where you would  */
/* like to write any created files and where you have placed this do file, respectively.          */
global SOURCE_DATA "C:\Documents and Settings\bsg1\My Documents\BSG_WORK_19W4th\Research\AST_11Spr\EmpiricalApplication\Source_Data";
global WRITE_DATA "C:\Documents and Settings\bsg1\My Documents\BSG_WORK_19W4th\Research\AST_11Spr\EmpiricalApplication\Created_Data";
global DO_FILES "C:\Documents and Settings\bsg1\My Documents\BSG_WORK_19W4th\Research\AST_11Spr\EmpiricalApplication\Stata_Do";

/* read in source data (extract from April 30, 2008 release for NLSY79) */
infile using "$SOURCE_DATA\NLSY_BlkWhtGap.DCT";

/* household ID number (used as the cluster variable in every regression below) */
g HHID_79 = R0000149;

/* parents years of completed schooling at baseline (negative raw codes are left missing) */
g DadSch_in_79r = R0007900 if R0007900>=0;
g MomSch_in_79r = R0006500 if R0006500>=0;

/* Basic respondent demographics */
/* NOTE(review): each 0/1 indicator below is 0 whenever the raw item differs from the tested     */
/* value, which includes any non-response codes in the raw item - confirm that is intended.      */
g usborn        = (R0000700==1);
g mother_usborn = (R0006100==1);
g father_usborn = (R0007300==1);
g male          = (R0214800==1);
g hispanic      = (R0214700==1);
g black         = (R0214700==2);

/* Stata treats a missing value as larger than any number, so the upper guard keeps a missing    */
/* birth year from being classified as born 1962 or later.                                       */
g born1962to1964 = (R0000500>=62 & R0000500<.);
g yearborn       = R0000500;

/* These three must stay adjacent and in this order: the regressions below refer to them with    */
/* the varlist range yearborn62-yearborn64.                                                      */
g yearborn62 = (R0000500==62);
g yearborn63 = (R0000500==63);
g yearborn64 = (R0000500==64);

/* sample membership indicators and sample weights */
/* NOTE(review): the meaning of the individual R0173600 codes is not documented in this file -   */
/* check them against the NLSY79 codebook.                                                       */
g core_sample = (R0173600<=8 | R0173600==10 | R0173600==11 | R0173600==13 | R0173600==14);
g male_blkwhthis_sample = (R0173600<=4 | R0173600==10 | R0173600==11);
g sample_wgts = R0216100;

/* age in base survey year, plus one indicator per age from 13 to 22 (Age13In1979 ... Age22In1979) */
g AgeIn1979 = R0000600;
forvalues a = 13/22 {;
    g Age`a'In1979 = (AgeIn1979==`a');
};

/* AFQT percentile */
g AFQT        = R0618300 if R0618300>0;
g AFQT_NoProb = (R0614800==51);                          /* AFQT score based on test with no reported "problems" */
g AFQT_Adj1   = AFQT if AFQT_NoProb==1;                  /* AFQT scores, problem free only */
g AFQT_Adj2   = invnormal(AFQT_Adj1/100) if AFQT_Adj1~=.; /* transform to approximate normality */

/* CPI with 1982-84 = 100, used to express 1990-1992 amounts in 1993 prices */
local cpi90 130.7;
local cpi91 136.2;
local cpi92 140.3;
local cpi93 144.5;

/* Calculate real annual earnings 1990 to 1993 (1993 prices). Negative raw values are left       */
/* missing. rowmean averages over whichever of the four years are non-missing.                   */
g earnings90 = R3559001*(`cpi93'/`cpi90') if R3559001>=0;
g earnings91 = R3897101*(`cpi93'/`cpi91') if R3897101>=0;
g earnings92 = R4295101*(`cpi93'/`cpi92') if R4295101>=0;
g earnings93 = R4982801 if R4982801>=0;
egen AvgEarnings_90to93 = rowmean(earnings90 earnings91 earnings92 earnings93);
g LogEarn = log(AvgEarnings_90to93);

/* Calculate average hourly wages (1993 prices). Only raw values in [100, 7500] are used.        */
/* NOTE(review): presumably the raw wage items are in cents per hour - confirm in the codebook.  */
g wages90 = R3127800*(`cpi93'/`cpi90') if R3127800>=100 & R3127800<=7500;
g wages91 = R3523500*(`cpi93'/`cpi91') if R3523500>=100 & R3523500<=7500;
g wages92 = R3728500*(`cpi93'/`cpi92') if R3728500>=100 & R3728500<=7500;
g wages93 = R4416800 if R4416800>=100 & R4416800<=7500;
egen AvgHourlyWages_90to93 = rowmean(wages90 wages91 wages92 wages93);
g LogWage = log(AvgHourlyWages_90to93);

/* replication of Neal and Johnson (1996) sample (without hispanics) */
/* NJ_target_sample: sample/cohort/ethnicity restrictions only.                                  */
/* NJ_sample: the same restrictions plus non-missing LogWage, yearborn and AFQT_Adj1.            */
g NJ_target_sample = (male_blkwhthis_sample==1 & born1962to1964==1 & hispanic ~= 1 & hispanic ~= .);
g NJ_sample = (male_blkwhthis_sample==1 & born1962to1964==1 & hispanic ~= 1
               & LogWage~=. & yearborn~=. & black ~= . & hispanic ~= . & AFQT_Adj1~=.);

/* these two tabulations run before the log is opened, so they appear on screen only */
tab NJ_target_sample;
tab NJ_sample;

/* close any log left open by an earlier, interrupted run so that -log using- cannot fail */
capture log close;
log using "$WRITE_DATA\NLSY79_Sample_Log", replace;
log on;

/* NOTE(review): this table has no -if- qualifier, so it covers every observation read in, not   */
/* only NJ_sample==1 - confirm that is intended.                                                 */
table black [pweight=sample_wgts], c(mean yearborn mean AFQT_Adj1 mean LogWage);

/* summary statistics on black and white differences */
reg AvgHourlyWages_90to93 black [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79);
reg LogWage               black [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79);
reg AFQT_Adj1             black [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79);
reg yearborn62            black [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79);
reg yearborn63            black [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79);
reg yearborn64            black [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79);

/* replicate Neal and Johnson (1996) basic finding */
/* no constant: the three birth-year indicators are entered instead */
reg LogWage yearborn62-yearborn64 black           [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79) nocons;
reg LogWage yearborn62-yearborn64 black AFQT_Adj1 [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79) nocons;
reg LogWage yearborn62-yearborn64 black AFQT_Adj2 [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79) nocons;

/* compute estimates of average log hourly wages by race */
reg LogWage       [pweight=sample_wgts] if NJ_sample==1 & black==1, cluster(HHID_79);
reg LogWage       [pweight=sample_wgts] if NJ_sample==1 & black==0, cluster(HHID_79);
reg LogWage black [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79);

/* compute estimates of CDFs of black and white wage distributions (and differences at various   */
/* points). For each cutoff 500, 750, ..., 2000 the indicator t marks wages at or below the      */
/* cutoff. Its weighted mean is estimated for black==1, for black==0, and the difference is the  */
/* coefficient on black. t is left holding the last cutoff (2000) and is saved with the data.    */
g t = .;
forvalues cutoff = 500(250)2000 {;
    replace t = (AvgHourlyWages_90to93<=`cutoff');
    reg t       [pweight=sample_wgts] if NJ_sample==1 & black==1, cluster(HHID_79);
    reg t       [pweight=sample_wgts] if NJ_sample==1 & black==0, cluster(HHID_79);
    reg t black [pweight=sample_wgts] if NJ_sample==1, cluster(HHID_79);
};

log off;
log close;

/* keep the analysis sample only and write it out as delimited text and as a Stata dataset */
keep if NJ_sample==1;
sort HHID_79;
outsheet sample_wgts HHID_79 AvgHourlyWages_90to93 LogWage yearborn black AFQT_Adj1 if NJ_sample==1
    using "$WRITE_DATA\NLSY79_Sample.out", replace;
save "$WRITE_DATA\NLSY79_Sample.dta", replace;