/*****************************************************************************

This do-file merges endline outcomes with baseline covariates
to create the final analysis datasets (one child-level, one caregiver-level)
for the impact evaluation paper of SPOON Guatemala

Updated on 9/13/2022

*****************************************************************************/

******************** CHILD DATA ****************************
** ONLY TARGET

* Load the endline file of target children (1290 children)
use "${el_processed}/child_target_el.dta", clear

* Discard the file's own age and id_caregiver: id_caregiver is rebuilt from the
* parent IDs right below, and the name "age" is reused later for the caregiver's
* age that is merged in from the caregiver file
drop age id_caregiver

* Caregiver ID = mother's ID, falling back to the father's ID when momID is missing
generate double id_caregiver = cond(momID==., dadID, momID)

* Copy of the child's own person ID; used further down as a merge key
generate double targetID = id_persona

* Age in months, converted from age in days (precise age from anthro data)
generate child_age = edad_dias/(365.25/12)
label variable child_age "Child's age in months"

* Age intervals; children younger than 24 or 44+ months fall outside the
* cut points and get a missing category
egen child_age_cat = cut(child_age), at(24,30,32,33,34,35,36,37,38,39,40,42, 44) label

rename male child_sex
label variable child_sex "Child's sex"
 
* Attach the ENDLINE caregiver's age and primary-education indicator to each child.
* Several children can share one caregiver (m:1). Caregiver records without a
* target child in this file are not kept.
merge m:1 folio id_caregiver using "${el_processed}/caregiver_target_el.dta", ///
	keepusing(age educ_prim_c) keep(master match)

* Flag children with no caregiver record under this ID
generate not_matched = (_merge==1) // 20 didn't match, probably because not mom or dad who is primary caregiver
drop _merge

rename (age educ_prim_c) (cg_age cg_educ)
label variable cg_age "Caregiver age"
label variable cg_educ "Caregiver primary education"

* Write out the children whose caregiver record was not found, so caregiver info
* can be looked up another way (missing IDs) -- 20 total, all in separate houses
preserve
	drop if not_matched!=1
	* keep only the household ID, the child's person ID and the mother's ID;
	* this also removes the existing targetID so id_persona can take that name
	keep folio id_persona momID
	rename id_persona targetID
	save "${el_processed}/caregivers_nomatch.dta", replace
restore

* Build a lookup (one row per unmatched child) with caregiver age/education taken
* from the demographics survey, saved as caregivers_nomatch_demographics.dta.
* The child data in memory is set aside with preserve and brought back at the end.
preserve
	use "${el_processed}/caregiver_target_el.dta", clear
	drop targetID // targetID is taken from the no-match list instead
	* folio 61091 has 2 caregivers with information. keep mom instead of grandma
	drop if folio==61091 & id_persona==2

	* Link caregivers to the unmatched children by household only.
	* 17 match, 3 are missing knowledge data (those 3 come in as using-only rows);
	* caregivers from households not on the list are not kept
	merge m:1 folio using "${el_processed}/caregivers_nomatch.dta", ///
		keep(match using) nogenerate

	* using-only rows have no caregiver person ID yet: fall back to the mother's ID
	replace id_persona = momID if id_persona==.

	* move the caregiver-file values aside so the demographics values can come in
	* under the plain names
	* NOTE(review): age_original / educ_prim_c_original are not used after this point,
	* so every row saved here (not only the 3 using-only ones) carries the
	* demographics-survey values -- confirm this is intended
	rename (age educ_prim_c) (age_original educ_prim_c_original)

	* get age and education from demographics survey; rows without a demographics
	* record are not kept
	merge 1:1 folio id_persona using "${el_processed}/demographics_el.dta", ///
		keepusing(age educ_prim_c) keep(match) nogenerate

	replace id_caregiver = id_persona if id_caregiver==.

	keep folio id_persona age educ_prim_c targetID id_caregiver
	* _new suffix keeps these from colliding with the child file's variables
	rename (age educ_prim_c id_caregiver) (age_new educ_prim_c_new id_caregiver_new)
	save "${el_processed}/caregivers_nomatch_demographics.dta", replace
restore

* Bring the recovered caregiver info back into the child file.
* match on target instead of on momID
merge 1:1 folio targetID using "${el_processed}/caregivers_nomatch_demographics.dta"

* fill caregiver education and age only where the first caregiver merge left them missing
replace cg_educ = educ_prim_c_new if cg_educ==.
replace cg_age = age_new if cg_age==.

* for children found in the lookup, adopt the recovered caregiver ID
replace id_caregiver = id_caregiver_new if _merge==3
drop _merge

* add caregiver/household baseline characteristics
* The baseline file is keyed on idunico/idpersona, so build matching keys here
rename folio idunico
generate double idpersona = id_caregiver

* park the endline practice index under an _el name while baseline data is merged in
rename practice_index practice_index_el

* baseline covariates to pull from mom.dta
local bl_covars decisiones grit rosenberg inc_h wealth ///
	prop_males0_5 prop_males6_18 prop_males19_49 prop_males50 ///
	prop_females0_5 prop_females6_18 prop_females19_49 prop_females50 ///
	interview_date

* baseline-only records are not kept
* 79 don't match... mostly new households?
merge m:1 idunico idpersona using "${bl_processed}/mom.dta", ///
	keepusing(idpersona idunico `bl_covars') keep(master match)

tabulate el_only if _merge==1 // 61 are new households ; other 18 have different caregiver ID identified at baseline
drop _merge

* rename all baseline variables so it's clear, and impute missing with control mean
* For each baseline covariate:
*   1. add the _bl suffix
*   2. take the mean among tratamiento_hogar==3 (the control group)
*   3. create <var>_bl_imputed = 1 where the baseline value was missing
*   4. fill the missing values with that mean
* NOTE(review): the mean is computed before ineligible communities are dropped
* further down -- confirm that is the intended reference sample.
* NOTE(review): interview_date is imputed too, so the baseline-age variables computed
* later use the control-mean date for children without a baseline record;
* interview_date_bl_imputed stays in the data to identify them.
foreach var in decisiones grit rosenberg inc_h wealth prop_males0_5 prop_males6_18 prop_males19_49 prop_males50  prop_females0_5 prop_females6_18 prop_females19_49 prop_females50 interview_date {
	rename `var' `var'_bl
	sum `var'_bl if tratamiento_hogar==3
	local mean = `r(mean)'
	gen `var'_bl_imputed =(`var'_bl==.)
	replace `var'_bl=`mean' if `var'_bl==.
}

	* same people missing all household characteristics -- make just one variable for those missing and drop others
	* (wealth's flag becomes the single household flag; all eight prop_* flags are
	* dropped -- prop_females19_49_bl_imputed was previously left behind)
	rename wealth_bl_imputed hh_imputed
	drop prop_males0_5_bl_imputed prop_males6_18_bl_imputed prop_males19_49_bl_imputed prop_males50_bl_imputed ///
		prop_females0_5_bl_imputed prop_females6_18_bl_imputed prop_females19_49_bl_imputed prop_females50_bl_imputed


* give the endline practice index its original name back now that the baseline merge is done
rename practice_index_el practice_index

* calculate age for those alive at baseline
* alive_bl = 1 when the birthday is on or before the baseline interview date.
* Stata treats missing as larger than any number, so a missing bday gives alive_bl = 0.
gen alive_bl = bday<=interview_date_bl // birthday was before the interview date
* Age in months at baseline; days are converted with 365.25/12, the same factor
* used for the endline child_age, so the two ages are on the same scale
gen child_age_bl = (interview_date_bl - bday)/(365.25/12) if alive_bl==1

* Re-label baseline variables
* Household composition shares: same four age bands for males and females
foreach sex in males females {
	label variable prop_`sex'0_5_bl "Proportion of `sex' 0-5 years"
	label variable prop_`sex'6_18_bl "Proportion of `sex' 6-18 years"
	label variable prop_`sex'19_49_bl "Proportion of `sex' 19-49 years"
	label variable prop_`sex'50_bl "Proportion of `sex' 50+ years"
}

label variable decisiones_bl "Decision-making power"
label variable interview_date_bl "Baseline interview date"

* Indicators for baseline values that were missing and filled in
label variable decisiones_bl_imputed "Decision-making power: Missing"
label variable grit_bl_imputed "Grit: Missing"
label variable rosenberg_bl_imputed "Rosenberg: Missing"
label variable inc_h_bl_imputed "Household income: Missing"
label variable hh_imputed "Household characteristics: Missing"

* Child's status and age at the baseline interview
label variable alive_bl "Alive at baseline"
label variable child_age_bl "Age at baseline, m"

* drop variables we don't need (raw identifiers/names and helper variables created above)
drop momid mom_name child_name mom_personid agemonths fecha_nacimiento sexo fecha_medicion ///
	not_matched age_new id_caregiver_new educ_prim_c_new ///
	dad_house mom_house id_s16_ma id_s16_ni

* Restrict the sample; community counts are shown before and after
distinct communityid // includes all 80; restrict to 76 eligible communities
keep if comunidad_elegible!=0
distinct communityid
keep if tratamiento_hogar!=. // drop 1 with unknown treatment assignment

* Final child-level analysis file
save "${processed}/child_target_analysis.dta", replace


******************** CAREGIVER DATA ****************************
* Load the endline caregiver file and give the caregiver's own characteristics cg_ names
use "${el_processed}/caregiver_target_el.dta", clear

rename (age educ_prim_c male) (cg_age cg_educ cg_sex)

label variable cg_age "Caregiver age"
label variable cg_educ "Caregiver primary education"
label variable cg_sex "Caregiver sex"

* rename endline variables so baseline can merge
* (the baseline file brings in variables with these same names)
local el_scales decisiones grit rosenberg
foreach scale of local el_scales {
	rename `scale' `scale'_el
}

* add baseline characteristics
* The baseline file is keyed on idunico/idpersona, so build matching keys here
rename folio idunico
generate double idpersona = id_persona

* baseline covariates to pull from mom.dta
local bl_covars conocimiento decisiones grit rosenberg inc_h wealth ///
	prop_males0_5 prop_males6_18 prop_males19_49 prop_males50 ///
	prop_females0_5 prop_females6_18 prop_females19_49 prop_females50 ///
	preg

* one record per caregiver on both sides; baseline-only records are not kept
* 110 don't match... mostly new households?
merge 1:1 idunico idpersona using "${bl_processed}/mom.dta", ///
	keepusing(idpersona idunico `bl_covars') keep(master match)

tabulate el_only if _merge==1 // 62 are new households
drop _merge

* rename all baseline variables so it's clear, and impute missing with control mean
* For each baseline covariate:
*   1. add the _bl suffix
*   2. take the mean among tratamiento_hogar==3 (the control group)
*   3. create <var>_bl_imputed = 1 where the baseline value was missing
*   4. fill the missing values with that mean
* NOTE(review): the mean is computed before ineligible communities are dropped
* further down -- confirm that is the intended reference sample.
foreach var in decisiones grit rosenberg conocimiento inc_h wealth prop_males0_5 prop_males6_18 prop_males19_49 prop_males50  prop_females0_5 prop_females6_18 prop_females19_49 prop_females50{
	rename `var' `var'_bl
	sum `var'_bl if tratamiento_hogar==3
	local mean = `r(mean)'
	gen `var'_bl_imputed =(`var'_bl==.)
	replace `var'_bl=`mean' if `var'_bl==.
}

* pregnancy status at baseline only gets the suffix; it is not imputed
rename preg preg_bl 

	* same people missing all household characteristics -- make just one variable for those missing and drop others
	* (wealth's flag becomes the single household flag; all eight prop_* flags are
	* dropped -- prop_females19_49_bl_imputed was previously left behind)
	rename wealth_bl_imputed hh_imputed
	drop prop_males0_5_bl_imputed prop_males6_18_bl_imputed prop_males19_49_bl_imputed prop_males50_bl_imputed ///
		prop_females0_5_bl_imputed prop_females6_18_bl_imputed prop_females19_49_bl_imputed prop_females50_bl_imputed
	
	
* change endline back: strip the temporary _el suffix now that the baseline
* versions carry _bl
local el_scales decisiones grit rosenberg
foreach scale of local el_scales {
	rename `scale'_el `scale'
}

* Re-label baseline variables
* Household composition shares: same four age bands for males and females
foreach sex in males females {
	label variable prop_`sex'0_5_bl "Proportion of `sex' 0-5 years"
	label variable prop_`sex'6_18_bl "Proportion of `sex' 6-18 years"
	label variable prop_`sex'19_49_bl "Proportion of `sex' 19-49 years"
	label variable prop_`sex'50_bl "Proportion of `sex' 50+ years"
}

label variable decisiones_bl "Decision-making power"
label variable conocimiento_bl "Baseline knowledge index"
label variable preg_bl "Pregnant at baseline"

* Indicators for baseline values that were missing and filled in
label variable decisiones_bl_imputed "Decision-making power: Missing"
label variable grit_bl_imputed "Grit: Missing"
label variable rosenberg_bl_imputed "Rosenberg: Missing"
label variable conocimiento_bl_imputed "Baseline knowledge: Missing"
label variable inc_h_bl_imputed "Household income: Missing"
label variable hh_imputed "Household characteristics: Missing"

* Remove variables that are not carried into the analysis file
drop momID know_merge mom_test max_child momID_child max_momID min_momID mom_sub ///
	know_test dup_child know_complete dup_know ///
	targetID target_max target_min

* Restrict to eligible communities; community counts are shown before and after
distinct communityid
keep if comunidad_elegible!=0
distinct communityid
keep if tratamiento_hogar!=. // 1 without treatment assignment

* Final caregiver-level analysis file
save "${processed}/caregiver_target_analysis.dta", replace