tidyverse: Converting character columns to factors using mutate_if and mutate_each_
- sam33frodon
- Dec 28, 2020
- 6 min read
Updated: Jan 27, 2021
# Load readr for read_csv(), then read the adult income CSV from the working
# directory; the column types guessed by readr are printed below.
library(readr)
adult <- read_csv("adultincome.csv")
## Parsed with column specification:
## cols(
## age = col_double(),
## workclass = col_character(),
## fnlwgt = col_double(),
## education = col_character(),
## education.num = col_double(),
## marital.status = col_character(),
## occupation = col_character(),
## relationship = col_character(),
## race = col_character(),
## sex = col_character(),
## capital.gain = col_double(),
## capital.loss = col_double(),
## hours.per.week = col_double(),
## native.country = col_character(),
## income = col_character()
## )
# Inspect the structure: the text columns come in as character (chr), not factor.
str(adult)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 32561 obs. of 15 variables:
## $ age : num 90 82 66 54 41 34 38 74 68 41 ...
## $ workclass : chr "?" "Private" "?" "Private" ...
## $ fnlwgt : num 77053 132870 186061 140359 264663 ...
## $ education : chr "HS-grad" "HS-grad" "Some-college" "7th-8th" ...
## $ education.num : num 9 9 10 4 10 9 6 16 9 10 ...
## $ marital.status: chr "Widowed" "Widowed" "Widowed" "Divorced" ...
## $ occupation : chr "?" "Exec-managerial" "?" "Machine-op-inspct" ...
## $ relationship : chr "Not-in-family" "Not-in-family" "Unmarried" "Unmarried" ...
## $ race : chr "White" "White" "Black" "White" ...
## $ sex : chr "Female" "Female" "Female" "Female" ...
## $ capital.gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ capital.loss : num 4356 4356 4356 3900 3900 ...
## $ hours.per.week: num 40 18 40 40 40 45 40 20 40 60 ...
## $ native.country: chr "United-States" "United-States" "United-States" "United-States" ...
## $ income : chr "<=50K" "<=50K" "<=50K" "<=50K" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. workclass = col_character(),
## .. fnlwgt = col_double(),
## .. education = col_character(),
## .. education.num = col_double(),
## .. marital.status = col_character(),
## .. occupation = col_character(),
## .. relationship = col_character(),
## .. race = col_character(),
## .. sex = col_character(),
## .. capital.gain = col_double(),
## .. capital.loss = col_double(),
## .. hours.per.week = col_double(),
## .. native.country = col_character(),
## .. income = col_character()
## .. )
library(tidyverse)
1. Method 1: Using mutate_if
# mutate_if() applies as.factor() to every column for which is.character()
# returns TRUE; all other columns are left unchanged.
adult1 <- mutate_if(adult, is.character, as.factor)
str(adult1)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 32561 obs. of 15 variables:
## $ age : num 90 82 66 54 41 34 38 74 68 41 ...
## $ workclass : Factor w/ 9 levels "?","Federal-gov",..: 1 5 1 5 5 5 5 8 2 5 ...
## $ fnlwgt : num 77053 132870 186061 140359 264663 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 12 12 16 6 16 12 1 11 12 16 ...
## $ education.num : num 9 9 10 4 10 9 6 16 9 10 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 7 7 7 1 6 1 6 5 1 5 ...
## $ occupation : Factor w/ 15 levels "?","Adm-clerical",..: 1 5 1 8 11 9 2 11 11 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 2 5 5 4 5 5 3 2 5 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 3 5 5 5 5 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 2 1 1 2 ...
## $ capital.gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ capital.loss : num 4356 4356 4356 3900 3900 ...
## $ hours.per.week: num 40 18 40 40 40 45 40 20 40 60 ...
## $ native.country: Factor w/ 42 levels "?","Cambodia",..: 40 40 40 40 40 40 40 40 40 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 1 2 ...
2. Method 2: Using mutate_each_
# Columns to convert, listed explicitly by name.
cols <- c(
  "workclass", "education", "marital.status", "occupation", "relationship",
  "race", "sex", "native.country", "income"
)
# mutate_each_() takes the column names as a character vector. Both it and
# funs() are deprecated: dplyr emits the warnings shown below, which point to
# mutate_if(), mutate_at() or mutate_all() as replacements.
adult2 <- mutate_each_(adult, funs(factor(.)), cols)
## Warning: mutate_each() is deprecated
## Please use mutate_if(), mutate_at(), or mutate_all() instead:
##
## - To map `funs` over all variables, use mutate_all()
## - To map `funs` over a selection of variables, use mutate_at()
## This warning is displayed once per session.
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
# Check the result: the columns named in `cols` should now be factors.
str(adult2)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 32561 obs. of 15 variables:
## $ age : num 90 82 66 54 41 34 38 74 68 41 ...
## $ workclass : Factor w/ 9 levels "?","Federal-gov",..: 1 5 1 5 5 5 5 8 2 5 ...
## $ fnlwgt : num 77053 132870 186061 140359 264663 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 12 12 16 6 16 12 1 11 12 16 ...
## $ education.num : num 9 9 10 4 10 9 6 16 9 10 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 7 7 7 1 6 1 6 5 1 5 ...
## $ occupation : Factor w/ 15 levels "?","Adm-clerical",..: 1 5 1 8 11 9 2 11 11 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 2 5 5 4 5 5 3 2 5 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 3 5 5 5 5 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 2 1 1 2 ...
## $ capital.gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ capital.loss : num 4356 4356 4356 3900 3900 ...
## $ hours.per.week: num 40 18 40 40 40 45 40 20 40 60 ...
## $ native.country: Factor w/ 42 levels "?","Cambodia",..: 40 40 40 40 40 40 40 40 40 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 1 2 ...
3. Method 3: Using the lapply function
# Use lapply() instead of a dplyr verb: start from a copy of the data, then
# overwrite the columns named in `cols` (defined for Method 2) with factors.
adult3 <- adult
adult3[cols] <- lapply(adult[cols], factor)
str(adult3)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 32561 obs. of 15 variables:
## $ age : num 90 82 66 54 41 34 38 74 68 41 ...
## $ workclass : Factor w/ 9 levels "?","Federal-gov",..: 1 5 1 5 5 5 5 8 2 5 ...
## $ fnlwgt : num 77053 132870 186061 140359 264663 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 12 12 16 6 16 12 1 11 12 16 ...
## $ education.num : num 9 9 10 4 10 9 6 16 9 10 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 7 7 7 1 6 1 6 5 1 5 ...
## $ occupation : Factor w/ 15 levels "?","Adm-clerical",..: 1 5 1 8 11 9 2 11 11 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 2 5 5 4 5 5 3 2 5 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 3 5 5 5 5 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 2 1 1 2 ...
## $ capital.gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ capital.loss : num 4356 4356 4356 3900 3900 ...
## $ hours.per.week: num 40 18 40 40 40 45 40 20 40 60 ...
## $ native.country: Factor w/ 42 levels "?","Cambodia",..: 40 40 40 40 40 40 40 40 40 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 1 2 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. workclass = col_character(),
## .. fnlwgt = col_double(),
## .. education = col_character(),
## .. education.num = col_double(),
## .. marital.status = col_character(),
## .. occupation = col_character(),
## .. relationship = col_character(),
## .. race = col_character(),
## .. sex = col_character(),
## .. capital.gain = col_double(),
## .. capital.loss = col_double(),
## .. hours.per.week = col_double(),
## .. native.country = col_character(),
## .. income = col_character()
## .. )
Comentarios