* ==============================================================================
* T E M P L A T E  for  D A T A   C H E C K S  recommended by   A U S S D A   
*
* Study Title: 	
* Date: 		
* Name: 
*
* self-deposit data_checks-template version: 1.0
* This template contains commands that proved useful for datachecks. This by no 
* means indicates that all necessary checks are part of this Do-File.
*
* *all elements enclosed by "#" have to be filled in to make the code run
* *all "#" in the code have to be deleted
* 
*
* This do-file is licensed under a Creative Commons Attribution 4.0 International License
* (CC BY 4.0, https://creativecommons.org/licenses/by/4.0/)
* Suggested citation: 
* Bodlos, Anita; Heider, Veronika; Bischof, Christian; Butzlaff, Iris (2020). 
* Template for Data Checks. Do-File. Vienna: The Austrian Social Science Data Archive.
*
* ==============================================================================

********************************************************************************
* if not already installed, install the following packages to make this do-file run
ssc install fre
ssc install findname
ssc install labellist
ssc install scandata



				*  DATA CHECKS  *
				* ============= *
capture log close 																// close open log files


********************************************************************************
					*** define globals ***
		
global datadir	#"C:\Users\MyName\Myproject\Mydata"#							// path to working directory
global data		#"data_4_publication_v2_wide"#									// name of the dataset; do NOT add ".dta" to the filename

********************************************************************************
					*** Import data ***
			
cd "$datadir"																	// change to working directory
dtaversion "${data}.dta" 														// check STATA version, AUSSDA recommends version 14 (saveold #filename#, version(14) )
use "${data}.dta", clear														// open data file 
									
********************************************************************************
				*** COMPARE Data & Documentation ***

log using "${data}_datachecks" 													// create log file (evtl. using the "replace" option)

*** short descriptive information on the dataset and variables
d, s

*** check if the ID variable is indeed an ID variable
isid #id-var#              														// replace "id-var" with the varname of the ID var

*** look for unlabeled values
scandata, nolabel 
local varnolabels =  r(mis_lab)
capture codebook `varnolabels', tab(100)										// if no variables with unlabeled values are found, you get an error message ("invalid name")


*** check for systemmissings that may cause problems for other data formats		// if no frequencies appear, stata found no system missings
foreach var of varlist _all {
	capture confirm numeric variable `var'										// check if variable is numeric
	    if !_rc {																// if variable is numeric...
            qui count if missing(`var') & `var' != .  							// ... count if variable is missing, but not "."
				if r(N) > 0 {													// if you find one or more missings that are not "."...
					fre `var' if missing(`var') & `var' != .					// show me the frequencies
				}
         }
}

*** check all variables and their labels
foreach var of varlist _all {
	fre `var'
}


********************************************************************************
*** check value labels for spelling errors and special characteristcs: 
*** Warning: Dataset will be cleared!
*** Warning: all changes to labels with "label define" are lost if made before "restore".
preserve																		// saves the current version of the dataset

*** create labelbook for checks
uselabel, clear																	// creates a dataset that contains all label names and value labels

*** check for leading or trailing blanks
gen trimcheck = strtrim(label)													// generate variable that contains the trimmed version of the variable "label"
replace trimcheck = stritrim(trimcheck)											// remove embedded blanks from trimcheck
tostring value, gen(value_str)													
gen Code = "label define " + lname + " " + value_str + `" ""' + trimcheck + `"" , modify"'   // create code for correcting non-trimmed labels (needs to be copied to the section ("modified labels")
list  if trimcheck != label	, sep(0) 											// have a look on labels that have been trimmed and code for modifying labels
drop trimcheck value_str Code 													// delete variables after having corrected potential leading or trailing blanks

*** check for special characters
gen check_specialChar =  label													// duplicate the variable "label" for further curation
replace check_specialChar =  ustrregexra(check_specialChar, "[a-zA-Z0-9]","") 	// delete letters a-z & all digits
replace check_specialChar =  ustrregexra(check_specialChar, uchar(39), "") 		// Apostrophe
replace check_specialChar =  ustrregexra(check_specialChar, uchar(44), "") 		// comma
replace check_specialChar =  ustrregexra(check_specialChar,  "\.", "") 			// full stop
replace check_specialChar =  ustrregexra(check_specialChar, uchar(47), "") 		// Slash
replace check_specialChar =  ustrregexra(check_specialChar, uchar(45), "") 		// Hyphen-minus
replace check_specialChar =  strtrim(check_specialChar)

fre check_specialChar															// have a look on variable labels that potentially cause problems in other data formats (e.g. "€")
list lname value label check_specialChar if check_specialChar != ""
drop check_specialChar

*** if you would like to perform a spell check in Excel 
gen spellchecked = label														// create variable for spellcheck
order spellchecked, after(label)
drop trunc
gen Code = `"="label define "&A2&" "&B2&" "&ZEICHEN(34)&D2&ZEICHEN(34)&" , modify""' in 1  // by doubleclicking on the code in Excel, Excel should convert the strings to a formular that you can then copy to all other cells
gen TypoFound = `"=wenn(C2=D2;"";"TypoFound")"' in 1 							// formular can be copied to all other cells; the formular compares the original label to the one with the corrected typo and shows "TypoFound" in case you made any correctionin so that you can find the changes more easily
export excel using "${data}_spellcheck.xlsx", firstrow(var)						// use option "replace" if you would like to overwrite an existing file
*erase "${data}_spellcheck.xlsx"												// if you would like to delete the spellcheck file

restore

*** Modified labels: 															// here is the place to modify labels


********************************************************************************
					*** Anonymization checks ***
		
*** Look for string variables  *
******************************
findname, type(string) local(strvars)											// search for string variables and show frequencies of all stringvar
local n_stringvars: word count of `strvars'										// get an overview of string variables
if `n_stringvars' > 1 {															
	codebook `strvars', compact	
	}
foreach var of local strvars{													// have a look on string variables in more detail
	fre `var'
}

			* Further Anonymisation checks *									// perform manual anonymisation checks




********************************************************************************
					*** Plausibility checks ***



					
********************************************************************************
					*** Add archiving variables and rename dataste ***
												
*** ADD DOI and VERSION to the beginning of each dataset**
gen version  = #"1.0 (202#Y#-#MM#-#DD#)"# 										// add year, month, day always use small letter in variable name
label var version "AUSSDA archive version"

gen doi = #"doi:10.11587/######"#												// add doi
label var doi "digital object identifier"
order version doi, first

*** save the dataset under the standard filename pattern "DOIsuffix_da_language_version.dta" 
saveold #"######_da_en_v1_0.dta"# , version(14)									// if the language of your filename is German, replace "en" with "de"

*** close the log file **
log close