***************************************************************
***************************************************************
*
*							ESEMPIO 2
*
****************************************************************
* Le cose si possono complicare ulteriormente quando non abbiamo 
* un ID famiglia. Utilizziamo l'ID di coppia
***************************************************************
* includiamo anno e mese di nascita e il paese 
* ricordare che il mese di nascita ha dei dati mancanti
***************************************************************
* ---------------------------------------------------------------
* STEP 1 - load the master file, explore the couple ID and run the
* first probabilistic linkage against dn_link.
* ---------------------------------------------------------------
cd "G:\Il mio Drive\Multifonte"

use sharew1_rel8-0-0_cv_r, clear

* Master record ID passed to reclink2 through idm().
* It is assigned from the row order of the file as loaded, BEFORE any
* sort: the -bysort- below reorders the data and Stata's sort does not
* keep tied observations in a reproducible order, so numbering the rows
* afterwards would give IDs that change from run to run and that cannot
* be compared with the IDs later stored in step1.dta.
gen id=_n

****************************************************************
tab interview
tab interview if coupleid1!=""

* Running counter of the records that share a couple ID
* (left missing when the couple ID is empty).
bys coupleid1: gen num=_n if coupleid1!=""
tab num
tab num if interview==1


*****************************************************************
* Look at what the couple ID refers to: link on year and month of
* birth, gender, interview and couple ID; interview and couple ID
* must agree exactly (require), one candidate pair is kept per
* master record (npairs) and the variables coming from dn_link get
* the prefix "_" (upr).
reclink2 yrbirth mobirth gender interview coupleid1 using dn_link,    ///
    gen(myscore) idm(id) idu(ID) minscore(.9)                         ///
    wmatch(8 5 3 5 10) wnomatch(5 2 7 7 7)                            ///
    require(interview coupleid1) npairs(1) upr(_)

tab _merge interview
tab myscore interview

*********************************************************
* Keep in mind that this model looks for matches either
* inside a couple or among records with an empty couple ID.
* SO: WHERE ARE FALSE MATCHES MORE LIKELY?
***********************************************************
* The full variable name coupleid1 is used throughout, so the
* commands do not depend on variable-name abbreviation.
tab myscore interview if coupleid1==""
tab myscore interview if coupleid1!=""


*****************************************************************
* Most matches have a matching score equal to 1. This means we can
* make mistakes on matches that share date of birth, country and
* gender and that have no partner.
* LOOK AT WHO OUR POTENTIALLY WRONG MATCHES ARE
*****************************************************************
* N  = position of the record inside its group of identical keys
* N1 = size of that group (N1>1 means the key is not unique)
bys yrbirth mobirth gender interview coupleid1: gen N=_n
by  yrbirth mobirth gender interview coupleid1: gen N1=_N

tab N1 myscore if coupleid1=="" & interview==1
tab N1 myscore if coupleid1!="" & interview==1


**********************************************************************
* Indicator for possible FALSE MATCHES or non-matches. An interviewed
* record (interview==1) is flagged when
*   - it was not linked at all (_merge==1), or
*   - it has no couple ID and its key is shared by other records (N1>1), or
*   - it has no couple ID, a unique key, but a score below 1.
**********************************************************************
gen nomatch=1 if (_merge==1 & interview==1)                 ///
    | (N1>1  & interview==1 & coupleid1=="")                ///
    | (N1==1 & interview==1 & coupleid1=="" & myscore<1)

***
tab nomatch myscore
tab nomatch interview,mi
tab N1 nomatch

***
* NOTE(review): relr is kept exactly as written; it is presumably an
* abbreviation of relrper - confirm against the dataset.
tab  relr nomatch, mi
tab cvresp nomatch
tab partnerinhh nomatch,mi
tab hhsize nomatch


* Keep only the records we trust and store them as the step-1 result.
drop if nomatch==1

save step1.dta, replace
* 43964 - 36444 = 7520 cases still to be linked



********************************************************************************
********************************************************************************

*					 SECOND STEP:

********************************************************************************
* Try to link the cases that we could not link (or are not sure we
* linked correctly) in the first STEP. Among them there will be
* FALSE MATCHES but also TRUE MATCHES.
********************************************************************************

************************************************************
* Build a variable with the number of household members.
* We cannot be sure this variable is identical in both datasets
* (it depends on the number of records per household).
***********************************************************
use dn_link, clear

* dn_link is overwritten at the end of this block, so on a second run
* num and num1 already exist. Each one is dropped on its own under
* -capture-: a single -drop num num1- would abort without dropping
* anything when only one of the two is present.
capture drop num
capture drop num1

* num  = position of the record inside its interview/household group
* num1 = number of records in that group
bys interview hhid: gen num=_n
by  interview hhid: gen num1=_N
tab num1 if coupleid1==""
save dn_link,replace

**********************************************************
* STEP 2, first attempt: link on gender, date of birth, country,
* household size and interview, skipping the pairs stored in step1.
**********************************************************
use sharew1_rel8-0-0_cv_r, clear

* Master record ID for idm(). It must be assigned before the -bysort-
* below: exclude(step1) identifies already-linked records by this ID,
* so it has to be the row number of the file as loaded and not a row
* number taken after a sort whose order of ties is not reproducible.
gen id=_n

* num  = position inside the interview/household group
* num1 = number of records in that group
bys interview hhid: gen num=_n
by  interview hhid: gen num1=_N
tab num1

reclink2 gender yrbirth mobirth country num1 interview using dn_link,    ///
    gen(myscore2) idm(id) idu(ID) minscore(.8)                           ///
    wmatch(3 8 5 5 3 10) wnomatch(9 7 2 7 5 10) _merge(_merge2)          ///
    require(country interview) npairs(1) upr(_) orblock(country) exclude(step1)


tab _merge2
* The score created by this call is myscore2 (written out in full).
tab _merge2 myscore2


* Problem 3: we are also including people who have a partner
* -> when we use exclude() we want to link only those
* that we did not link in step 1.
tab _merge2 if coupleid1!=""
tab _merge2 if coupleid1==""


 
 
***************************************************************************
***************************************************************************

* 			ADJUSTING STEP 2: add the couple ID

***************************************************************************
use sharew1_rel8-0-0_cv_r, clear

* Master record ID for idm(), taken from the row order of the file as
* loaded. It is created before the -bysort- below because
* exclude(step1) matches on this ID and a row number taken after a
* sort is not reproducible (ties are not kept in a fixed order).
gen id=_n

* Number of household members:
* num = position inside the interview/household group, num1 = group size.
bys interview hhid: gen num=_n
by  interview hhid: gen num1=_N

reclink2 gender yrbirth mobirth country num1 interview coupleid1 using dn_link,    ///
    gen(myscore2) idm(id) idu(ID) minscore(.8)                                     ///
    wmatch(3 8 5 5 3 10 10) wnomatch(9 7 2 7 5 10 10) _merge(_merge2)              ///
    require(coupleid1 interview) npairs(1) upr(_) orblock(country) exclude(step1)


tab _merge2 myscore2
* We can assume that matches with a score below 1 are false,
* but among those with score = 1, which ones are TRUE?

* N = position inside the group of records with identical keys,
* N1 = size of that group.
bys yrbirth mobirth gender interview country coupleid1 num1: gen N=_n
by  yrbirth mobirth gender interview country coupleid1 num1: gen N1=_N
tab N1 myscore2 if _merge2==3
* We can be confident that records with score = 1 and no duplicates
* in the matrix are TRUE MATCHES.



 
 
 
  ***************************************************************
  * ADJUSTING STEP 2: add the interview language
  ***************************************************************
use sharew1_rel8-0-0_cv_r, clear

* Master record ID for idm(): row number of the file as loaded.
gen id=_n

reclink2 gender yrbirth mobirth country interview coupleid1 language using dn_link,    ///
    gen(myscore2) idm(id) idu(ID) minscore(.75)                                        ///
    wmatch(3 8 5 7 10 10 10) wnomatch(8 7 2 8 10 10 4) _merge(_merge2)                 ///
    require(coupleid1 interview) npairs(1) upr(_) orblock(country) exclude(step1)


* N = position inside the group of records with identical keys,
* N1 = size of that group (N1>1 means the key is not unique).
bys yrbirth mobirth gender interview country coupleid1 language: gen N=_n
by  yrbirth mobirth gender interview country coupleid1 language: gen N1=_N

* The score created by this call is myscore2 (written out in full).
tab N1 myscore2 if _merge2==3

 
 
 
 ********************************************************
 * Add the new matches to the STEP 1 file: only linked
 * records (_merge2==3) whose key is unique (N1 not above 1)
 * and whose score is at least 0.94999.
 ********************************************************
keep if N1<=1 & myscore2>=0.94999 & _merge2==3
tabulate gender

* step is 0 for the records in memory and 1 for those read from
* step1.dta; the records in memory are then relabelled as step 2.
append using step1.dta, generate(step)
tabulate step
replace step = 2 if step == 0
tabulate step interview

* Always keep track of how many cases are still to be linked:
* 7520 - 4743 = 2777
save step1, replace
 
 
 ****************************************************************
 *
 *			STEP 3: look for further common indicators
 *
 ****************************************************************
use dn_link.dta, clear

* dn_link is overwritten at the end of this block, so on a second run
* pr already exists and -gen pr- would stop the do-file.
capture drop pr

* For households with a single member, the respondent is the only
* individual in the household.
* pr = 1 when dn038_ equals 1 and the couple ID is empty.
tab dn038_ if coupleid1==""
gen pr=1 if dn038_==1 & coupleid1==""
tab mobirth if coupleid1==""

save dn_link, replace
 
 
 ***********************************************************
   clear
use sharew1_rel8-0-0_cv_r.dta

*indicatore del rispondente
tab relrper if coupleid1=="",mi
gen pr=1 if relrper==. & coupleid1==""

*ricodifica il mese di nascita
fre mobirth if coupleid1=="" & interview==1
recode mobirth(-2=-1)

*numero componenti
bys interview hhid: gen num=_n
bys interview hhid: egen num1=max(num)


 gen id=_n
 reclink2 gender yrbirth mobirth country num1 interview coupleid1 language pr using dn_link,    ///
gen(myscore2) idm(id) idu(ID) minscore(.8) wmatch(3 15 7 5 4 5 5 6 5) wnomatch(8 7 2 5 5 10 10 4 4) _merge(_merge2)                        ///
require(coupleid1 interview) npairs(1) upr(_) orblock(country) exclude(step1)


 tab myscore 
 
 *quanti individui hanno le stesse caratteristiche e hanno score=1 -> match veri
 bys gender yrbirth mobirth country num1 interview coupleid1 language pr: gen dup=_n
 bys gender yrbirth mobirth country num1 interview coupleid1 language pr: egen dup1=max(dup)
 
 tab dup1 myscore if dup1<5 & myscore>0.85
 
 *review manuale
 sort gender yrbirth mobirth country num1 interview coupleid1 language pr
 br *gender *yrbirth *mobirth *country *num1 *language *pr myscore if interview==1 & myscore<1 & dup1==1

 *la maggioranza di questi match hanno non-corrispondenze nell'anno di nascita
 tab _merge2 if interview==1 & myscore<1 & dup1==1 & mobirth!=_mobirth
 
 drop if _merge2!=3 | dup1>1 | (dup1==1 & myscore2<0.899999)
 
 
  append using step1.dta, gen(step2)

tab step step2,mi
* 2777-589 = 2188