---
title: "How to use RSDA 3.3"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Introduction to RSDA packages}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Global chunk options for the whole vignette, then attach the package.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 6,
  fig.height = 5
)
library(RSDA)
```

## RSDA Package version 3.3

## Oldemar Rodríguez R.

## Installing the package

### CRAN

```{r, eval = FALSE}
install.packages("RSDA", dependencies = TRUE)
```

### GitHub

```{r, eval = FALSE}
devtools::install_github("PROMiDAT/RSDA")
```

## How to read a Symbolic Table from a CSV file with RSDA?

```{r}
# The file is ';'-separated, uses '.' as decimal mark, has a header row,
# and its first column holds the row names.
ex3 <- read.sym.table(
  file = "tsym1.csv",
  header = TRUE,
  sep = ";",
  dec = ".",
  row.names = 1
)
ex3
```

## How to save a Symbolic Table in a CSV file with RSDA?

```{r, eval = FALSE}
# Not evaluated when the vignette is built (eval = FALSE), so no file is
# written.
write.sym.table(
  ex3,
  file = "tsymtemp.csv",
  sep = ";",
  dec = ".",
  row.names = TRUE,
  col.names = TRUE
)
```

## Symbolic Data Frame Example in RSDA

```{r}
data(example3)
example3
```

```{r}
# Subsetting works as on a regular data frame.
example3[2, ]     # second row, all variables
example3[, 3]     # third variable, all rows
example3[2:3, 5]  # rows 2 and 3 of the fifth variable
example3$F1       # variable F1 selected by name
```

## How to generate a symbolic data table from a classic data table in RSDA?

```{r}
data(ex1_db2so)
ex1_db2so
```

The `classic.to.sym` function converts a traditional table into a symbolic
one. To do this we must indicate the following parameters:

+ `x` = a data.frame
+ `concept` = variables to be used as a concept
+ `variables` = variables to be used, compatible with tidyselect options
+ `default.numeric` = function that will be used by default for numerical values (sym.interval)
+ `default.categorical` = function that will be used by default for categorical values (sym.modal)

#### Example 1

```{r}
# Concepts are given here as bare column names.
result <- classic.to.sym(
  x = ex1_db2so,
  concept = c(state, sex),
  variables = c(county, group, age)
)
result
```

We can add new variables indicating the type we want them to be.
```{r}
# Concepts are given here as character strings; `age_hist` is an extra
# histogram variable built from `age`, with breaks from pretty().
result <- classic.to.sym(
  x = ex1_db2so,
  concept = c("state", "sex"),
  variables = c(county, group, age),
  age_hist = sym.histogram(age, breaks = pretty(ex1_db2so$age, 5))
)
result
```

#### Example 2

```{r}
data(USCrime)
head(USCrime)
```

```{r}
result <- classic.to.sym(
  x = USCrime,
  concept = state,
  variables = c(NumInShelters, NumImmig, ViolentCrimesPerPop),
  ViolentCrimesPerPop_hist = sym.histogram(
    ViolentCrimesPerPop,
    breaks = pretty(USCrime$ViolentCrimesPerPop, 5)
  )
)
result
```

#### Example 3

```{r}
data("ex_mcfa1")
head(ex_mcfa1)
```

```{r}
sym.table <- classic.to.sym(
  x = ex_mcfa1,
  concept = suspect,
  variables = c(hair, eyes, region),
  default.categorical = sym.set
)
sym.table
```

#### Example 4

We can modify the function that will be applied by default to the categorical variables.

```{r}
# No `variables` argument here, so the function's default selection is used.
sym.table <- classic.to.sym(
  x = ex_mcfa1,
  concept = suspect,
  default.categorical = sym.set
)
sym.table
```

### Converting SODAS 1.0 `*.SDS` files to RSDA files

```{r}
hani3101 <- SDS.to.RSDA(file.path = "hani3101.sds")
hani3101
```

```{r, eval = FALSE}
# We can save the file in CSV to RSDA format as follows:
write.sym.table(
  hani3101,
  file = "hani3101.csv",
  sep = ";",
  dec = ".",
  row.names = TRUE,
  col.names = TRUE
)
```

### Converting SODAS 2.0 `*.XML` files to RSDA files

```{r}
abalone <- SODAS.to.RSDA("abalone.xml")
abalone
```

```{r, eval = FALSE}
write.sym.table(
  abalone,
  file = "abalone.csv",
  sep = ";",
  dec = ".",
  row.names = TRUE,
  col.names = TRUE
)
```

### Basic statistics

#### Symbolic Mean

```{r}
data(example3)
# The same variable selected by name and by position.
mean(example3$F1)
mean(example3[, 1])
```

```{r}
mean(example3$F2)
mean(example3[, 2])
```

```{r}
mean(example3$F2, method = "interval")
mean(example3[, 2], method = "interval")
```

#### Symbolic median

```{r}
median(example3$F1)
median(example3[, 1])
```

```{r}
median(example3$F2)
median(example3[, 2])
```

```{r}
median(example3$F6, method = "interval")
median(example3[, 6], method = "interval")
```

#### Variance and standard deviation

```{r}
var(example3[, 1])
var(example3[, 2])
var(example3$F6)
var(example3$F6, method = "interval")
var(example3$F6, method = "billard")
sd(example3$F1)
sd(example3$F2)
sd(example3$F6)
sd(example3$F6, method = "interval")
sd(example3$F6, method = "billard")
```

### Symbolic correlation

```{r}
cor(example3$F1, example3$F4)
cor(example3[, 1], example3[, 4])
cor(example3$F2, example3$F6, method = "centers")
cor(example3$F2, example3$F6, method = "billard")
```

### Radar plot for intervals

```{r}
library(ggpolypath)
data(oils)
# NOTE(review): to.v2()/to.v3() are reached through `:::`, i.e. they are not
# part of RSDA's exported interface. Confirm this round trip is still needed.
oils <- RSDA:::to.v3(RSDA:::to.v2(oils))
sym.radar.plot(oils[2:3, ])
sym.radar.plot(oils[2:5, ])

res <- interval.histogram.plot(oils[, 2], n.bins = 4, col = c(2, 3, 4, 5))
res

res <- interval.histogram.plot(
  oils[, 3],
  n.bins = 3,
  main = "Histogram",
  col = c(2, 3, 4)
)
res
```

### Distances for intervals

#### Gowda-Diday

```{r}
# Reload `oils`, replacing the converted copy assigned in the radar plot
# section.
data("oils")
DM <- sym.dist.interval(sym.data = oils[, 1:4], method = "Gowda.Diday")
model <- hclust(DM)
plot(model, hang = -1)
```

#### Ichino

```{r}
DM <- sym.dist.interval(sym.data = oils[, 1:4], method = "Ichino")
model <- hclust(DM)
plot(model, hang = -1)
```

#### Hausdorff

```{r}
DM <- sym.dist.interval(
  sym.data = oils[, c(1, 2, 4)],
  gamma = 0.5,
  method = "Hausdorff",
  normalize = FALSE,
  SpanNormalize = TRUE,
  euclidea = TRUE,
  q = 2
)
model <- hclust(DM)
plot(model, hang = -1)
```

### Linear regression for intervals

#### Training

```{r}
data(int_prost_train)
data(int_prost_test)
# `lpsa` is the response; all remaining variables are predictors.
res.cm <- sym.lm(formula = lpsa ~ ., sym.data = int_prost_train, method = "cm")
res.cm
```

#### Prediction

```{r}
pred.cm <- sym.predict(model = res.cm, new.sym.data = int_prost_test)
```

#### Testing

```{r}
RMSE.L(int_prost_test$lpsa, pred.cm$Fitted)
RMSE.U(int_prost_test$lpsa, pred.cm$Fitted)
R2.L(int_prost_test$lpsa, pred.cm$Fitted)
R2.U(int_prost_test$lpsa, pred.cm$Fitted)
deter.coefficient(int_prost_test$lpsa, pred.cm$Fitted)
```

### LASSO regression for intervals

```{r}
data(int_prost_train)
data(int_prost_test)
```

#### Training

```{r}
# The response is given by column position (9); alpha = 1 is the setting used
# for the LASSO fit of this section.
res.cm.lasso <- sym.glm(
  sym.data = int_prost_train,
  response = 9,
  method = "cm",
  alpha = 1,
  nfolds = 10,
  grouped = TRUE
)
```

#### Prediction

```{r}
pred.cm.lasso <- sym.predict(
  res.cm.lasso,
  response = 9,
  int_prost_test,
  method = "cm"
)
```

#### Testing

```{r}
plot(res.cm.lasso)
plot(res.cm.lasso$glmnet.fit, "lambda", label = TRUE)
```

```{r}
RMSE.L(int_prost_test$lpsa, pred.cm.lasso)
RMSE.U(int_prost_test$lpsa, pred.cm.lasso)
R2.L(int_prost_test$lpsa, pred.cm.lasso)
R2.U(int_prost_test$lpsa, pred.cm.lasso)
deter.coefficient(int_prost_test$lpsa, pred.cm.lasso)
```

### RIDGE regression for intervals

#### Training

```{r}
data(int_prost_train)
data(int_prost_test)
# Same call as the LASSO fit, but with alpha = 0 for the ridge fit of this
# section.
res.cm.ridge <- sym.glm(
  sym.data = int_prost_train,
  response = 9,
  method = "cm",
  alpha = 0,
  nfolds = 10,
  grouped = TRUE
)
```

#### Prediction

```{r}
pred.cm.ridge <- sym.predict(
  res.cm.ridge,
  response = 9,
  int_prost_test,
  method = "cm"
)
```

#### Testing

```{r}
plot(res.cm.ridge)
plot(res.cm.ridge$glmnet.fit, "lambda", label = TRUE)
RMSE.L(int_prost_test$lpsa, pred.cm.ridge)
RMSE.U(int_prost_test$lpsa, pred.cm.ridge)
R2.L(int_prost_test$lpsa, pred.cm.ridge)
R2.U(int_prost_test$lpsa, pred.cm.ridge)
deter.coefficient(int_prost_test$lpsa, pred.cm.ridge)
```

### PCA for intervals

#### Example 1

```{r}
data("oils")
res <- sym.pca(oils, "centers")
plot(res, choix = "ind")
plot(res, choix = "var")
```

#### Example 2

```{r}
res <- sym.pca(oils, "tops")
plot(res, choix = "ind")
```

#### Example 3

```{r}
res <- sym.pca(oils, "principal.curves")
plot(res, choix = "ind")
```

#### Example 4

```{r}
res <- sym.pca(oils, "optimized.distance")
plot(res, choix = "ind")
plot(res, choix = "var")
```

#### Example 5

```{r}
res <- sym.pca(oils, "optimized.variance")
plot(res, choix = "ind")
plot(res, choix = "var")
```

### Symbolic Multiple Correspondence Analysis

#### Example 1

```{r}
data("ex_mcfa1")
ex_mcfa1
```

```{r}
sym.table <- classic.to.sym(
  x = ex_mcfa1,
  concept = suspect,
  default.categorical = sym.set
)
sym.table
```

```{r}
# The same variable positions are passed to the analysis and to the plot.
res <- sym.mcfa(sym.table, c(2, 3))
mcfa.scatterplot(
  res[, 2],
  res[, 3],
  sym.data = sym.table,
  pos.var = c(2, 3)
)
```

```{r}
res <- sym.mcfa(sym.table, c(2, 3, 4))
mcfa.scatterplot(
  res[, 2],
  res[, 3],
  sym.data = sym.table,
  pos.var = c(2, 3, 4)
)
```

### Symbolic UMAP

#### Example: Oils

```{r}
oils_data <- oils
oils_data
```

```{r}
oils_umap <- sym.umap(oils_data)
oils_umap
```

```{r}
plot(oils_umap)
```

#### Example: Cardiological

```{r}
# Load the dataset explicitly, as the other sections do for theirs.
data("Cardiological")
cardio_data <- Cardiological
cardio_data
```

```{r}
cardio_umap <- sym.umap(cardio_data)
cardio_umap
```

```{r}
plot(cardio_umap)
```