RTransposer: ejemplo de uso (1)

It shows how to use [R] (https://www.r-roject.org/) and the package ** RTransposer ** to import the example (1) supplied by Alter in the github repositiry “Transposer”

Preparacion de los datos fuente.

Download the data from the github repository: ICPSR / IDS corresponding to example-1 (UG_example_1) and unzip the book excell UG_Example_1_v10.xlsx

library(RTransposer)
require(data.table) 
#> Loading required package: data.table
require(readxl)
#> Loading required package: readxl
rm(list = ls())

tmpLoc <- tempdir() ; print(tmpLoc)
#> [1] "/tmp/RtmpoMY5YO"
## Descarga y descomprensión
zipfile <-  paste0(tmpLoc,'/UG_example_1.zip')

download.file('https://github.com/ICPSR/IDS/raw/master/User_Guide/UG_example_1.zip', 
              destfile = zipfile )
unzip(zipfile,list = T)
#>                    Name Length                Date
#> 1    CHILDREN_INPUT.csv  30028 2017-02-15 15:43:00
#> 2            entity.csv   3470 2017-02-25 18:52:00
#> 3     PARENTS_INPUT.csv  24327 2017-02-15 15:42:00
#> 4      PLACES_INPUT.csv   4410 2017-02-15 14:46:00
#> 5            ReadMe.txt    603 2017-02-25 19:01:00
#> 6      relationship.csv   1907 2017-02-25 18:52:00
#> 7 UG_Example_1_v10.xlsx 417114 2017-02-25 19:00:00

unzip(zipfile,files = 'UG_Example_1_v10.xlsx',  exdir =  tmpLoc )
xfile <- paste0(tmpLoc,'/UG_Example_1_v10.xlsx')
file.remove(zipfile)
#> [1] TRUE

We extract the data from the data sheets * parent * and * children * and store them in an object of type * data.table *. Next we create some auxiliary tables derived from the two original source tables:


as.data.table(read_xlsx(xfile, sheet = 'parents'))  -> parents
as.data.table(read_xlsx(xfile, sheet = 'children')) -> children

#------------------------------------
# Create INPUT-tables derived from primary tables (if necessary)
#------------------------------------
parents[,.(parid,hlast,marriage)][ 
   children[,
            .(name,birth, birthloc, kidid= parid*1000+1:.N, 
                                    momid=parid+2,
                                    dadid=parid+1),
            keyby=.(parid)], on='parid'] -> children_input

names.loc  <- names(parents)[grep("loc",names(parents))]
names.loc  <- paste0('c(', paste(names.loc, collapse = ', '),', children$birthloc )')
places <- parents[, sort(unique(eval(parse(text=names.loc))))]

places_input<- data.table( 
  places = places,
  placeid = 1L:length(places)+100L,
  level  =  "Neighborhood",
  nested = 100L,
  relvar = "Neighborhood and Municipality"
  )
# append tow end register
places_input <-   rbind(places_input, 
      data.table(  places = c("California","Los Angeles"),
                                 placeid = c(1L,100L),
                                 level  =  c("State","Municipality"),
                                 nested = c(NA,1L),
                                 relvar = c("","Municipality and State")))

Si es necesario se crea funciones auxiliares para la carga, que transformaran los datos originales almacenado en las tablas fuentes:

###-------------------------------------------------------------------------
###  auxiliary functions for ValueExpresion
###-------------------------------------------------------------------------
#' MunicipalityName2Code
#'    Entra un vector con nombres de municipios y retorna codigos numericos 
#'    contenido en la tabla 
#'    @param mun.names vector con nombre
#'    @tab.codes  data.table with names and codes
MunicipalityName2Code <- function ( mun.names, tab.codes = places_input, 
                                    geo.names='places' ,geo.codes ='placeid' ) {
   tab.codes[[geo.codes]] -> codes
   names(codes) <- tab.codes[[geo.names]]
   return(codes[mun.names])
}

Especificaciones de las regla de importanción

Las transformaciones que se aplicaran sobre las variables de los fichero fuentes se define en un libro excell con una estructura especifica (Se incluyen un ejemplo de este libro en el directorio inst/extdata/ del paquete) para optener los registros que compone el IDS.

Generación de la BD en formato IDS

A continuación se aplica la función TRtransposer::tranposer, con el parámetro el nombre del fichero excell con las distintas transformaciones a aplicar sobre las tablas de datos, previamente cargadas en el ambiente global de R (*.GlobalEnv).

##--------------------------------------------------------------------------
## Transposer with rules to import in  EntityRelationDefinition01.xlsx
transposer(file.definition = system.file("extdata", 
                                        "ERD_UG_Example_1_v10.xlsx", 
                                        package = "RTransposer") ) -> outcomes

La ejecución del código anterior produce una lista con 5 objeto del tipo data.table y con la nomenclatura y estructura especificado en el formato IDS.

A partir de este objeto se pueden obtener las 5 tablas en fichero de texto del tipo CSV, que pueden ser facilmente leídos por programas de base de dato o análisis estadístico.

Las siguientes ordenes realizan esta exportación

### Muestra resumen estadistico de outcomes::
print(paste('== INDIVIDUAL == ')) ;  print(outcomes$INDIVIDUAL[,.N,keyby=Type])
#> [1] "== INDIVIDUAL == "
#>                  Type   N
#>  1:        BIRTH_DATE 565
#>  2:    BIRTH_LOCATION 565
#>  3:        DEATH_DATE 200
#>  4:    DEATH_LOCATION 200
#>  5:        FIRST_NAME 565
#>  6:         LAST_NAME 565
#>  7:     MARRIAGE_DATE 200
#>  8: MARRIAGE_LOCATION 200
#>  9:        OCCUPATION 200
#> 10:               SEX 200
print(paste('== CONTEXT == ')); print(outcomes$CONTEXT[,.N,keyby=Type])
#> [1] "== CONTEXT == "
#>     Type   N
#> 1: LEVEL 174
#> 2:  NAME  74
print(paste('== INDIV_INDIV == ')); print(outcomes$INDIV_INDIV[,.N,keyby=Relation])
#> [1] "== INDIV_INDIV == "
#>    Relation   N
#> 1:    Child 730
#> 2:   Father 365
#> 3:  Husband 100
#> 4:   Mother 365
#> 5:     Wife 100
print(paste('==  INDIV_CONTEXT == ')); print(outcomes$INDIV_CONTEXT[,.N,keyby=Relation])
#> [1] "==  INDIV_CONTEXT == "
#>    Relation   N
#> 1:    Child 365
#> 2:  Husband 100
#> 3:     Wife 100
print(paste('==  CONTEXT_CONTEXT == ')); print(outcomes$CONTEXT_CONTEXT[,.N,keyby=Relation])
#> [1] "==  CONTEXT_CONTEXT == "
#>                         Relation   N
#> 1:                                 1
#> 2:        Municipality and State   1
#> 3: Neighborhood and Municipality  72
#> 4:        Union and Neighborhood 665

## Almacena resultados en tablas CSV con resultados
for (i in  names(outcomes)) {
  write.csv(outcomes[[i]],file=paste0(tmpLoc,'/',i,'.CSV'),na='',row.names=FALSE)
}
print(tmpLoc)
#> [1] "/tmp/RtmpoMY5YO"
dir(path = tmpLoc,pattern = '*.CSV')
#> [1] "CONTEXT_CONTEXT.CSV" "CONTEXT.CSV"         "INDIV_CONTEXT.CSV"  
#> [4] "INDIV_INDIV.CSV"     "INDIVIDUAL.CSV"

Borramos los ficheros de prueba …


file.remove(paste0(tmpLoc,'/',names(outcomes),'.CSV'))
file.remove(paste0(tmpLoc,'/','UG_Example_1_v10.xlsx'))

Viciana, F; …

2019-02-06

Preparacion de los datos fuente.

Especificaciones de las regla de importanción

Generación de la BD en formato IDS

Contents