Julia: DataFramesMeta Transformation
I am trying to reproduce the following R code in Julia:
# Load dplyr for %>%, filter and mutate.
library(dplyr)
# Append two rows to the `women` data frame: (NA, 1) and (NA, NA).
women_new <- rbind(women, c(NA, 1), c(NA, NA))
women_new %>%
# Keep only the rows whose height is not missing.
filter(height %>% complete.cases) %>%
# Start from an empty-string column, then fill in the label for each height range.
mutate(sector = character(n()),
sector = replace(sector, height >= 0 & height <= 60, "1"),
sector = replace(sector, height >= 61 & height <= 67, "2"),
sector = replace(sector, height >= 68 & height <= 72, "3"))
My attempt in Julia is as follows:
using DataFrames
using DataFramesMeta
using Lazy
using RDatasets
# Thread "datasets" through the calls below with Lazy's `@>`; each result is
# passed as the first argument of the next call.
women = @> begin
"datasets"
dataset("women")
DataArray()
vcat([[NA NA]; [NA NA]])
end
# Rebuild a DataFrame with named columns from the two columns of the array.
women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);
# Set the Weight of row 16 afterwards, because both appended rows were all NA.
women_new[16, 2] = 1;
My first question: is there a way to put the 1 directly into the appended rows, i.e. vcat([[NA 1]; [NA NA]]), as in R? If I do that, it returns the following error:
MethodError: Cannot `convert` an object of type DataArrays.NAtype to an object of type Int64
This may have arisen from a call to the constructor Int64(...),
since type constructors fall back to convert methods.
in macro expansion at multidimensional.jl:431 [inlined]
in macro expansion at cartesian.jl:64 [inlined]
in macro expansion at multidimensional.jl:429 [inlined]
in _unsafe_batchsetindex!(::Array{Int64,2}, ::Base.Repeated{DataArrays.NAtype}, ::UnitRange{Int64}, ::UnitRange{Int64}) at multidimensional.jl:421
in setindex!(::Array{Int64,2}, ::DataArrays.NAtype, ::UnitRange{Int64}, ::UnitRange{Int64}) at abstractarray.jl:832
in cat_t(::Int64, ::Type{T}, ::DataArrays.NAtype, ::Vararg{Any,N}) at abstractarray.jl:1098
in hcat(::DataArrays.NAtype, ::Int64) at abstractarray.jl:1180
in include_string(::String, ::String) at loading.jl:441
in include_string(::String, ::String, ::Int64) at eval.jl:30
in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
in withpath(::Function, ::String) at eval.jl:38
in macro expansion at eval.jl:49 [inlined]
in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60
My second question: is there a way to convert a DataArray to a DataFrame directly? The columns could be named X1, X2, ... or given whatever default names DataFrame uses, because a DataArray has no column names. I think that would be neater than typing the following:
women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);
I would like to just call convert(DataFrame, women) and then rename the columns, but this conversion doesn't work. Here is my attempt at reproducing the filter and mutate steps of the R code:
# NOTE(review): `&` binds more tightly than `.>=` / `.<=`, so each condition
# below presumably parses as the chained comparison
# `:Height .>= (0 & :Height) .<= 60` rather than the intended
# `(:Height .>= 0) & (:Height .<= 60)`.
@> begin
women_new
@where !isna(:Height)
@transform(
Sector = NA,
Sector = ifelse(:Height .>= 0 & :Height .<= 60, 1,
ifelse(:Height .>= 61 & :Height .<= 67, 2,
ifelse(:Height .>= 68 & :Height .<= 72, 3, NA)))
)
end
But this will return:
15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Sector│
├─────┼────────┼────────┼───────┤
│ 1 │ 58 │ 115 │ 1 │
│ 2 │ 59 │ 117 │ 1 │
│ 3 │ 60 │ 120 │ 1 │
│ 4 │ 61 │ 123 │ 1 │
│ 5 │ 62 │ 126 │ 1 │
│ 6 │ 63 │ 129 │ 1 │
│ 7 │ 64 │ 132 │ 1 │
│ 8 │ 65 │ 135 │ 1 │
│ 9 │ 66 │ 139 │ 1 │
│ 10 │ 67 │ 142 │ 1 │
│ 11 │ 68 │ 146 │ 1 │
│ 12 │ 69 │ 150 │ 1 │
│ 13 │ 70 │ 154 │ 1 │
│ 14 │ 71 │ 159 │ 1 │
│ 15 │ 72 │ 164 │ 1 │
which is not equivalent to the R result. I have also tried the following:
# NOTE(review): `? :` needs a single Bool condition, but the comparisons here act
# on the whole :Height column, which is what the TypeError reported below says.
@> begin
women_new
@where !isna(:Height)
@transform(
Sector = NA,
Sector = :Height .>= 0 & :Height .<= 60 ? 1 :
:Height .>= 61 & :Height .<= 67 ? 2 :
:Height .>= 68 & :Height .<= 72 ? 3 :
NA;
)
end
But it returns the following error:
TypeError: non-boolean (DataArrays.DataArray{Bool,1}) used in boolean context
in (::###469#303)(::DataArrays.DataArray{Int64,1}) at DataFramesMeta.jl:55
in (::##298#302)(::DataFrames.DataFrame) at DataFramesMeta.jl:295
in #transform#38(::Array{Any,1}, ::Function, ::DataFrames.DataFrame) at DataFramesMeta.jl:270
in (::DataFramesMeta.#kw##transform)(::Array{Any,1}, ::DataFramesMeta.#transform, ::DataFrames.DataFrame) at <missing>:0
in include_string(::String, ::String) at loading.jl:441
in include_string(::String, ::String, ::Int64) at eval.jl:30
in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
in withpath(::Function, ::String) at eval.jl:38
in macro expansion at eval.jl:49 [inlined]
in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60
I would appreciate it if you could help me figure this out. My final question: is there a way to make my code as short as the R version while keeping it elegant?
source to share
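I figured it out. The problem is operator precedence: `&` binds more tightly than the comparison operators, so each comparison has to be wrapped in parentheses. I had assumed the parentheses were unnecessary.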
using DataFrames
using DataFramesMeta
using Lazy
using RDatasets
# Load the `women` dataset from the "datasets" package of RDatasets.
women = dataset("datasets", "women");
# Append two rows (Height = NA, NA; Weight = 1, NA), mirroring the rbind call in
# the R code. `@data` is used so that the Weight literal may contain NA.
women_new = vcat(
women,
DataFrame(Height = [NA; NA], Weight = @data [1; NA])
)
# Drop the rows with a missing Height, then add a Class column by height range.
@> begin
women_new
@where !isna(:Height)
@transform(
Class = NA,
# Each comparison is parenthesized so that `&` combines the two element-wise masks.
Class = ifelse((:Height .>= 0) & (:Height .<= 60), 1,
ifelse((:Height .>= 61) & (:Height .<= 67), 2,
ifelse((:Height .>= 68) & (:Height .<= 72), 3, NA)))
)
end
Update: The above code can be simplified further:
# Same classification, written as a scalar function mapped over :Height.
@> begin
women_new
@where !isna(:Height)
@transform(
# The inner `@>` passes the anonymous function as the first argument of `map`,
# i.e. map(function, :Height).
Class = @> begin
function (x)
0 <= x <= 60 ? 1 :
61 <= x <= 67 ? 2 :
68 <= x <= 72 ? 3 :
NA
end
map(:Height)
end
)
end
Or an alternative is to use Query.jl like this:
using DataFrames
using Query
using RDatasets
# Load the `women` dataset and append two rows (Height = NA, NA; Weight = 1, NA).
women = dataset("datasets", "women");
women_new = vcat(
women,
DataFrame(Height = [NA; NA], Weight = @data [1; NA])
)
# Query.jl version: keep the rows with a non-null Height and compute `class`
# row by row with scalar chained comparisons.
# NOTE(review): the fallback here is 0, whereas the DataFramesMeta versions
# fall back to NA. Confirm this difference is intended.
@from i in women_new begin
@where !isnull(i.Height)
@select {
i.Height, i.Weight,
class = 0 <= i.Height <= 60 ? 1 :
61 <= i.Height <= 67 ? 2 :
68 <= i.Height <= 72 ? 3 :
0
}
@collect DataFrame
end
The result is now correct:
15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1 │ 58 │ 115 │ 1 │
│ 2 │ 59 │ 117 │ 1 │
│ 3 │ 60 │ 120 │ 1 │
│ 4 │ 61 │ 123 │ 2 │
│ 5 │ 62 │ 126 │ 2 │
│ 6 │ 63 │ 129 │ 2 │
│ 7 │ 64 │ 132 │ 2 │
│ 8 │ 65 │ 135 │ 2 │
│ 9 │ 66 │ 139 │ 2 │
│ 10 │ 67 │ 142 │ 2 │
│ 11 │ 68 │ 146 │ 3 │
│ 12 │ 69 │ 150 │ 3 │
│ 13 │ 70 │ 154 │ 3 │
│ 14 │ 71 │ 159 │ 3 │
│ 15 │ 72 │ 164 │ 3 │
If we don't want to filter out the NA rows and instead want to keep all of the data, then the best I can do is:
# Classify every row without filtering, including rows whose Height is NA.
# According to the accompanying text, `ifelse` cannot handle NA in its first
# argument, so Height is first copied into a temporary column in which NA is
# replaced by the sentinel -1; that column is dropped again at the end.
@> begin
    women_new
    @transform(
        Height_New = NA,
        Height_New = ifelse(isna(:Height), -1, :Height))
    @transform(
        Class = NA,
        # Element-wise `.==` is required here: plain `==` compares the column as
        # a whole with -1 instead of producing a per-row mask, so the sentinel
        # branch would never select individual rows.
        Class = ifelse(:Height_New .== -1, NA,
                ifelse((:Height_New .>= 0) & (:Height_New .<= 60), 1,
                ifelse((:Height_New .>= 61) & (:Height_New .<= 67), 2,
                ifelse((:Height_New .>= 68) & (:Height_New .<= 72), 3, NA))))
    )
    # Remove the temporary sentinel column.
    delete!(:Height_New)
end
Update: The above code can be simplified further:
# Classify every row, keeping the rows whose Height is NA.
@> begin
women_new
@transform(
# The inner `@>` passes the anonymous function as the first argument of `map`,
# i.e. map(function, :Height). The function returns NA for an NA height before
# any range comparison is attempted.
Class = @> begin
function (x)
isna(x) ? NA :
0 <= x <= 60 ? 1 :
61 <= x <= 67 ? 2 :
68 <= x <= 72 ? 3 :
NA
end
map(:Height)
end
)
end
Or an alternative is to use Query.jl like this:
# Query.jl version without the `@where` filter, so rows with a missing Height
# are kept.
# NOTE(review): the explicit fallback is 0; how the chained comparisons treat a
# missing Height is not visible here. Verify against the printed output.
@from i in women_new begin
@select {
i.Height, i.Weight,
class = 0 <= i.Height <= 60 ? 1 :
61 <= i.Height <= 67 ? 2 :
68 <= i.Height <= 72 ? 3 :
0
}
@collect DataFrame
end
Output:
17×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1 │ 58 │ 115 │ 1 │
│ 2 │ 59 │ 117 │ 1 │
│ 3 │ 60 │ 120 │ 1 │
│ 4 │ 61 │ 123 │ 2 │
│ 5 │ 62 │ 126 │ 2 │
│ 6 │ 63 │ 129 │ 2 │
│ 7 │ 64 │ 132 │ 2 │
│ 8 │ 65 │ 135 │ 2 │
│ 9 │ 66 │ 139 │ 2 │
│ 10 │ 67 │ 142 │ 2 │
│ 11 │ 68 │ 146 │ 3 │
│ 12 │ 69 │ 150 │ 3 │
│ 13 │ 70 │ 154 │ 3 │
│ 14 │ 71 │ 159 │ 3 │
│ 15 │ 72 │ 164 │ 3 │
│ 16 │ NA │ 1 │ NA │
│ 17 │ NA │ NA │ NA │
In this case, the code gets messy because ifelse cannot yet handle NA in its first argument.
source to share