RTransposer: example of usage (2)

It shows how to use R and RTransposer package to import the example (2) provided by Alter in the “Transposer” package repository

Source file preparation

Downloading the data from github repository: ICPSR/IDS corresponding to example 2 (UG_example_2) and uncompress the excel file UG_Example_2_v10.xlsx

library(RTransposer)
require(data.table) 
#> Loading required package: data.table
require(readxl)     
#> Loading required package: readxl
rm(list = ls())     

tmpLoc <- tempdir() ; print(tmpLoc)  # creation of a temporary folder for files used and produced in this script
#> [1] "/tmp/RtmpEjRFjv"
## Download and uncompress
zipfile <-  paste0(tmpLoc,'/UG_example_2.zip')

download.file('https://github.com/ICPSR/IDS/raw/master/User_Guide/UG_example_2.zip', 
              destfile = zipfile )
unzip(zipfile,list = T)  
#>                    Name Length                Date
#> 1      births_input.csv  33969 2014-12-31 06:47:00
#> 2      deaths_input.csv  10272 2014-12-31 07:03:00
#> 3        entity_ex2.csv   2708 2017-02-25 19:17:00
#> 4   marriages_input.csv   9686 2014-12-31 06:46:00
#> 5      places_input.csv   4337 2014-12-31 10:49:00
#> 6            ReadMe.txt    634 2017-02-25 19:18:00
#> 7  relationship_ex2.csv   1436 2017-02-25 19:14:00
#> 8 UG_Example_2_v10.xlsx 455735 2017-02-25 19:17:00

unzip(zipfile,files = 'UG_Example_2_v10.xlsx',  exdir =  tmpLoc )  
xfile <- paste0(tmpLoc,'/UG_Example_2_v10.xlsx')
file.remove(zipfile)
#> [1] TRUE

Data exrtraction from parent and children sheets, storage in a data.table type object and creation of some auxiliary/imput tables derived from these two source tables:


as.data.table(read_xlsx(xfile, sheet = 'parents'))  -> parents
as.data.table(read_xlsx(xfile, sheet = 'children')) -> children

#------------------------------------
# Create INPUT tables derived from primaries (if necessary)  # in this case are 4, by sources (births, deaths, marriages) plus places
#------------------------------------
# places
names.loc  <- names(parents)[grep("loc",names(parents))]
names.loc  <- paste0('c(', paste(names.loc, collapse = ', '),', children$birthloc )')
places <- parents[, sort(unique(eval(parse(text=names.loc))))]

places_input<- data.table(
  place  = places,
  placeid = 1L:length(places)+100L,
  level  =  "Neighborhood",
  nested = 100L,
  relvar = "Neighborhood and Municipality"
)
  # append tow end register
places_input <-   rbind(places_input,
                        data.table(  place = c("California","Los Angeles"),
                                     placeid = c(1L,100L),
                                     level  =  c("State","Municipality"),
                                     nested = c(NA,1L),
                                     relvar = c("","Municipality and State")))

# marriages
parents[,.(parid,hfirst,hlast,wfirst,wlast,hocc,wocc,hid=parid*10+1,wid=parid*10+2,marriage,marrloc)]  -> marriages_input

# deaths
rbind(parents[,.(idi=parid*10+1,fname=hfirst,lname=hlast,death=hdeath,deathloc=hdeathloc)],
      parents[,.(idi=parid*10+2,fname=wfirst,lname=wlast,death=wdeath,deathloc=wdeathloc)]) -> deaths_input

# births
rbind(parents[,.(kidid=parid*10+1,fname=hfirst,lname=hlast,dadid=NA,momid=NA,birth=hbirth,birthloc=hbirthloc)],
      parents[,.(kidid=parid*10+2,fname=wfirst,lname=wlast,dadid=NA,momid=NA,birth=wbirth,birthloc=wbirthloc)],
      parents[,.(parid,lname=hlast)][
        children[,
                 .(fname=name,birth, birthloc, kidid= parid*1000+1:.N,
                   momid=parid*10+2,
                   dadid=parid*10+1),
                 keyby=.(parid)], on='parid'][,.(kidid,fname,lname,dadid,momid,birth,birthloc)]) -> births_input

If necessary, functions should be created to aid with the input, by transforming the original data stored in the source table; in this case it is created a function to convert geographic names (strings) into codes (integers):

###-------------------------------------------------------------------------
###  auxiliary functions for ValueExpresion
###-------------------------------------------------------------------------
#' MunicipalityName2Code
#'    Input of a vector of municipality names and output of numeric codes
#'    contained in the table 
#'    @param mun.names vector con nombre
#'    @tab.codes  data.table with names and codes
MunicipalityName2Code <- function ( mun.names, tab.codes = places_input, 
                                    geo.names='places' ,geo.codes ='placeid' ) {
   tab.codes[[geo.codes]] -> codes
   names(codes) <- tab.codes[[geo.names]]
   return(codes[mun.names])
}

Specifications of the import rules

The transformations applied to the variables in the source files are defined in a excel book with a specific structure (an example of this book can be found in the package’s inst/extdata/ folder) to obtain the records that compose the IDS.

Generating the database in IDS format

The TRtransposer::tranposer is applied, having as parameter the name of the excel file with the diferent transformations to be applied over the data tables, previously loaded in R’s global enviroment (*.GlobalEnv).

##--------------------------------------------------------------------------
## Transposer with rules to import in  EntityRelationDefinition01.xlsx
transposer(file.definition = system.file("extdata", 
                                        "ERD_UG_Example_2_v10.xlsx", 
                                        package = "RTransposer") ) -> outcomes

By running the previous code a list is produced containing 5 data.table type objects with the IDS specific names and structure.

From this list it is possible to get the 5 tables in .CSV file, that can be easily read by statistical analysis base softwares.

The following exports these tables.

### Statistical summary of outcomes::
print(paste('== INDIVIDUAL == ')) ;  print(outcomes$INDIVIDUAL[,.N,keyby=Type])
#> [1] "== INDIVIDUAL == "
#>                  Type   N
#>  1:        BIRTH_DATE 565
#>  2:    BIRTH_LOCATION 565
#>  3:        DEATH_DATE 200
#>  4:    DEATH_LOCATION 200
#>  5:        FIRST_NAME 965
#>  6:         LAST_NAME 965
#>  7:     MARRIAGE_DATE 200
#>  8: MARRIAGE_LOCATION 200
#>  9:        OCCUPATION 200
#> 10:               SEX 200
print(paste('== CONTEXT == ')); print(outcomes$CONTEXT[,.N,keyby=Type])
#> [1] "== CONTEXT == "
#>     Type  N
#> 1: LEVEL 74
#> 2:  NAME 74
print(paste('== INDIV_INDIV == ')); print(outcomes$INDIV_INDIV[,.N,keyby=Relation])
#> [1] "== INDIV_INDIV == "
#>    Relation   N
#> 1:    Child 730
#> 2:   Father 365
#> 3:  Husband 100
#> 4:   Mother 365
#> 5:     Wife 100
print(paste('==  INDIV_CONTEXT == ')); print(outcomes$INDIV_CONTEXT[,.N,keyby=Relation])
#> [1] "==  INDIV_CONTEXT == "
#>    Relation   N
#> 1:     <NA> 965
print(paste('==  CONTEXT_CONTEXT == ')); print(outcomes$CONTEXT_CONTEXT[,.N,keyby=Relation])
#> [1] "==  CONTEXT_CONTEXT == "
#>                         Relation  N
#> 1:        Municipality and State  1
#> 2: Neighborhood and Municipality 72

## Storages the outcomes in CSV tables
for (i in  names(outcomes)) {
  write.csv(outcomes[[i]],file=paste0(tmpLoc,'/',i,'.CSV'),na='',row.names=FALSE)
}
print(tmpLoc)
#> [1] "/tmp/RtmpEjRFjv"
dir(path = tmpLoc,pattern = '*.CSV')
#> [1] "CONTEXT_CONTEXT.CSV" "CONTEXT.CSV"         "INDIV_CONTEXT.CSV"  
#> [4] "INDIV_INDIV.CSV"     "INDIVIDUAL.CSV"

Eliminates used temporary files


file.remove(paste0(tmpLoc,'/',names(outcomes),'.CSV'))
file.remove(paste0(tmpLoc,'/','UG_Example_2_v10.xlsx'))

Viciana, F; Paiva,D

2019-02-06

Source file preparation

Specifications of the import rules

Generating the database in IDS format

Contents