* ==============================================================================
* T E M P L A T E  for  D A T A   C H E C K S  recommended by   A U S S D A   
*
* Study Title: 	
* Date: 		
* Name: 
*
* self-deposit data_checks-template version: 2.0
* This template contains commands that proved useful for datachecks. This by no 
* means indicates that all necessary checks are part of this Do-File.
*
* all elements enclosed by "#" are placeholders and have to be replaced to make the code run
* all "#" in the code have to be deleted
* 
*
* This do-file is licensed under a Creative Commons Attribution 4.0 International License
* (CC BY 4.0, https://creativecommons.org/licenses/by/4.0/)
* Suggested citation: 
* Bodlos, Anita; Heider, Veronika; Bischof, Christian; Butzlaff, Iris; Hirsch, Lisa (2025). 
* Template for Data Checks. Do-File. Vienna: The Austrian Social Science Data Archive.
*
* ==============================================================================

********************************************************************************
* Install the user-written commands used by this do-file, but only if they are
* not already installed: -which- returns a nonzero return code when a command
* cannot be found, and only then is -ssc install- run (requires internet access).
foreach pkg in fre findname labellist scandata {
	capture which `pkg'
	if _rc {
		ssc install `pkg'
	}
}



				*  DATA CHECKS  *
				* ============= *
* Close a log file that may still be open; -capture- lets the do-file continue
* when there is no open log to close.
capture log close


********************************************************************************
					*** define globals ***

* name of the dataset (replace the placeholder and delete the "#")
global data		#"data_4_publication_v2_wide.dta"#
* path to working directory (replace the placeholder and delete the "#")
global datadir	#"C:\Users\MyName\Myproject\Mydata"#

********************************************************************************
					*** Import data ***
********************************************************************************
			
* change to the working directory
cd "${datadir}"
* report the .dta format version of the data file
dtaversion "$data"
* If the format version is older than that of Stata 14, re-save the data in the
* version-14 format (saveold #filename#, version(14) ).
* This only works if your Stata version is version 14 or newer. If not, please contact AUSSDA.

* open the data file, replacing any data currently in memory
use "$data", clear
									
********************************************************************************
				*** COMPARE Data & Documentation ***
********************************************************************************

* create the log file; with -append- the output is added to an existing log of the same name
log using "${data}_datachecks", append

*** short descriptive information on the dataset
* check if the dataset is labeled, delete the label if it is not relevant
describe, short

*** check if the ID variable is indeed an ID variable
* an ID must uniquely identify observations; -isid- stops with an error if it does not
* replace "id-var" with the variable name of the ID variable
isid #id-var#

*** check for duplicates
* check whether any duplicates are correct or a mistake
duplicates report

*** look for unlabeled values
* all values must be labeled, except for midpoints on scales or numerical variables (e.g. age)
scandata, nolabel
* r(mis_lab) is taken from scandata's returned results; presumably it holds the
* names of the variables with unlabeled values (see -help scandata- to confirm)
local varnolabels = r(mis_lab)
* -capture noisily- keeps the do-file running if codebook fails, but still
* displays its output; plain -capture- would also hide the codebook tables.
* If no variables with unlabeled values are found, an error message
* ("invalid name") is displayed and the do-file continues.
capture noisily codebook `varnolabels', tab(100)


*** check for extended missing values (.a, .b, ..., .z) that will cause problems for other data formats
* if no frequencies appear, Stata found no extended missing values
* if you find .a, .b, ... missings, replace them with numerical values and labels (e.g. 99=don't know)
foreach v of varlist _all {
	capture confirm numeric variable `v'
	if _rc continue								// not numeric: nothing to check, go to the next variable
	quietly count if `v' > .					// values above "." are the extended missings .a to .z
	if r(N) == 0 continue						// none found: go to the next variable
	fre `v' if `v' > .							// show the frequencies of the extended missings
}

*** check each single variable and its label
* cross-check with both the questionnaire and the codebook:
* the information must be the same!
* please also check for typos and spelling mistakes
* check for any variables that contain sociodemographic information
* check if there are fewer than 20 observations per category (re-identification risk!)
foreach v of varlist * {
	fre `v'										// frequency table of every variable in the dataset
}


*** Modified labels: 															// here is the place to modify labels


********************************************************************************
					*** Anonymization checks ***
********************************************************************************
		
*** Look for string variables  *
******************************
* search for string variables and store their names in the local macro strvars
findname, type(string) local(strvars)
* number of string variables found; the extended macro function is
* -word count string-, so everything after "count" is counted as a word
local n_stringvars : word count `strvars'
* get an overview of the string variables, if there are any
if `n_stringvars' > 0 {
	codebook `strvars', compact
}

* have a look at the string variables in more detail
foreach var of local strvars {
	fre `var'
}

			* Room for anonymisation *											// manual recoding of sociodemographic vars


********************************************************************************
					*** Plausibility checks ***
********************************************************************************

* replace the placeholders "var1" and "var2" with the names of the variables
* to be checked and delete the "#"
tab #var1# #var2#																// cross-tabbing variables
tab #var1# if #var2# == 1														// tabbing variables for subgroups
					
********************************************************************************
					*** Add archiving variables and rename dataset ***
********************************************************************************
												
*** ADD DOI and VERSION to the beginning of each dataset**
* create both archiving variables as strings
generate version = #"1.0 (202#Y#-#MM#-#DD#)"# 									// add year, month, day
generate doi = #"doi:10.11587/######"#											// add doi

* label the new variables
label variable version "AUSSDA archive version"
label variable doi "digital object identifier"

* move both variables to the front of the dataset
order version doi, first

*** save the dataset under the standard filename pattern "DOIsuffix_da_language_version.dta"
* if the language of your filename is German, replace "en" with "de"
* -replace- overwrites a file of the same name from an earlier run, as the csv
* export below does, so that the do-file can be run again
save #"######_da_en_v1_0.dta"#, replace

** Save the dataset as a csv-file and upload it as well
* if the language of your filename is German, replace "en" with "de"
* -nolabel- exports the numeric values instead of the value labels
export delimited using #"######_da_en_v1_0.csv"#, nolabel replace  		// save csv-file

*** close the log file **
log close