Julia: DataFramesMeta Transformation

I am trying to reproduce the following R code in Julia:

# dplyr supplies %>%, filter(), mutate() and n() used below.
library(dplyr)

# Append two extra rows to `women`: c(NA, 1) and c(NA, NA).
women_new <- rbind(women, c(NA, 1), c(NA, NA))
# Keep only the rows whose height is not NA, then add a character column
# `sector`: it starts as empty strings (one per row) and is overwritten with
# "1", "2" or "3" depending on which height range the row falls into.
women_new %>% 
  filter(height %>% complete.cases) %>%
  mutate(sector = character(n()),
         sector = replace(sector, height >= 0 & height <= 60, "1"),
         sector = replace(sector, height >= 61 & height <= 67, "2"), 
         sector = replace(sector, height >= 68 & height <= 72, "3"))

      

My attempts at Julia are as follows:

using DataFrames
using DataFramesMeta
using Lazy
using RDatasets

# `@>` (from Lazy) threads each value into the next call as its first argument,
# so this loads the "women" dataset, converts it to a DataArray and appends two
# rows that contain only NA.
women = @> begin
  "datasets" 
  dataset("women")
  DataArray()
  vcat([[NA NA]; [NA NA]])
end

# Rebuild a DataFrame with explicit column names from the two DataArray columns.
women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);
# Row 16 is the first appended row; set its Weight (column 2) to 1 after the fact.
women_new[16, 2] = 1;

      

My first question is: is there a way to put the 1 directly into the appended rows, i.e. write vcat([[NA 1]; [NA NA]]), like in R? If I do that, it returns the following error:

MethodError: Cannot `convert` an object of type DataArrays.NAtype to an object of type Int64
This may have arisen from a call to the constructor Int64(...),
since type constructors fall back to convert methods.
 in macro expansion at multidimensional.jl:431 [inlined]
 in macro expansion at cartesian.jl:64 [inlined]
 in macro expansion at multidimensional.jl:429 [inlined]
 in _unsafe_batchsetindex!(::Array{Int64,2}, ::Base.Repeated{DataArrays.NAtype}, ::UnitRange{Int64}, ::UnitRange{Int64}) at multidimensional.jl:421
 in setindex!(::Array{Int64,2}, ::DataArrays.NAtype, ::UnitRange{Int64}, ::UnitRange{Int64}) at abstractarray.jl:832
 in cat_t(::Int64, ::Type{T}, ::DataArrays.NAtype, ::Vararg{Any,N}) at abstractarray.jl:1098
 in hcat(::DataArrays.NAtype, ::Int64) at abstractarray.jl:1180
 in include_string(::String, ::String) at loading.jl:441
 in include_string(::String, ::String, ::Int64) at eval.jl:30
 in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
 in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
 in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
 in withpath(::Function, ::String) at eval.jl:38
 in macro expansion at eval.jl:49 [inlined]
 in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60

      

My second question is: is there a way to convert a DataArray to a DataFrame? In that case the column names would be X1, X2, ... or whatever default names DataFrame uses, because a DataArray has no column names. I think this would be neater than typing the following:

women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);

      

I would like to just do convert(DataFrame, women) and then rename the columns, but this conversion doesn't work. And here is my attempt at reproducing the filter and mutate steps of the R code:

# First attempt: drop rows with NA Height, then derive Sector with nested ifelse.
# NOTE: in Julia `&` binds more tightly than the comparison operators, so
# `:Height .>= 0 & :Height .<= 60` is NOT parsed as
# `(:Height .>= 0) & (:Height .<= 60)`. The conditions below are therefore not
# the intended range tests, which is why the result shown after this snippet
# puts every row in sector 1.
@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Sector = NA,
    Sector = ifelse(:Height .>=  0 & :Height .<= 60, 1,
             ifelse(:Height .>= 61 & :Height .<= 67, 2,
             ifelse(:Height .>= 68 & :Height .<= 72, 3, NA)))
    )
end

      

But this will return:

15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Sector │
├─────┼────────┼────────┼────────┤
│ 1   │ 58     │ 115    │ 1      │
│ 2   │ 59     │ 117    │ 1      │
│ 3   │ 60     │ 120    │ 1      │
│ 4   │ 61     │ 123    │ 1      │
│ 5   │ 62     │ 126    │ 1      │
│ 6   │ 63     │ 129    │ 1      │
│ 7   │ 64     │ 132    │ 1      │
│ 8   │ 65     │ 135    │ 1      │
│ 9   │ 66     │ 139    │ 1      │
│ 10  │ 67     │ 142    │ 1      │
│ 11  │ 68     │ 146    │ 1      │
│ 12  │ 69     │ 150    │ 1      │
│ 13  │ 70     │ 154    │ 1      │
│ 14  │ 71     │ 159    │ 1      │
│ 15  │ 72     │ 164    │ 1      │

      

which is not equivalent to the R result. I have also tried the following:

# Second attempt: the same classification written with the ternary operator.
# `cond ? a : b` needs a single Bool as its condition, but the comparisons on
# the :Height column produce a DataArray{Bool}; this is what triggers the
# "non-boolean ... used in boolean context" TypeError quoted after this snippet.
@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Sector = NA,
    Sector = :Height .>=  0 & :Height .<= 60 ? 1 :
             :Height .>= 61 & :Height .<= 67 ? 2 :
             :Height .>= 68 & :Height .<= 72 ? 3 :
            NA;
    )
end

      

But it returns the following error:

TypeError: non-boolean (DataArrays.DataArray{Bool,1}) used in boolean context
 in (::###469#303)(::DataArrays.DataArray{Int64,1}) at DataFramesMeta.jl:55
 in (::##298#302)(::DataFrames.DataFrame) at DataFramesMeta.jl:295
 in #transform#38(::Array{Any,1}, ::Function, ::DataFrames.DataFrame) at DataFramesMeta.jl:270
 in (::DataFramesMeta.#kw##transform)(::Array{Any,1}, ::DataFramesMeta.#transform, ::DataFrames.DataFrame) at <missing>:0
 in include_string(::String, ::String) at loading.jl:441
 in include_string(::String, ::String, ::Int64) at eval.jl:30
 in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
 in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
 in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
 in withpath(::Function, ::String) at eval.jl:38
 in macro expansion at eval.jl:49 [inlined]
 in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60

      

I would appreciate it if you could help me figure this out. Finally, my last question: is there a way to shorten my code, as in R, while keeping it elegant?

+3


source to share


1 answer


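I figured it out. The problem is operator precedence: `&` binds more tightly than the comparison operators, so each comparison must be wrapped in parentheses. I had thought the parentheses were unnecessary.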

using DataFrames
using DataFramesMeta
using Lazy
using RDatasets

# Load the dataset as a DataFrame and append the two extra rows directly as a
# DataFrame, so the Weight value 1 can be given inline (`@data` builds a
# DataArray that may contain NA).
women = dataset("datasets", "women");
women_new = vcat(
              women,
              DataFrame(Height = [NA; NA], Weight = @data [1; NA])
            )

# Drop rows with NA Height, then classify Height into 1/2/3 by range.
# Each comparison is parenthesised because `&` binds more tightly than
# `.>=` / `.<=`; without the parentheses the conditions are parsed differently.
@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Class = NA,
    Class = ifelse((:Height .>=  0) & (:Height .<= 60), 1,
            ifelse((:Height .>= 61) & (:Height .<= 67), 2,
            ifelse((:Height .>= 68) & (:Height .<= 72), 3, NA)))
            )
end

      

Update: The above code can be simplified further:

# Same classification, written as a scalar function mapped over :Height.
# The inner `@>` threads the anonymous function into `map` as its first
# argument, i.e. it evaluates map(f, :Height). Because `f` receives one height
# at a time, chained scalar comparisons and the ternary operator can be used.
@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Class = @> begin
      function (x)
         0 <= x <= 60 ?  1 :
        61 <= x <= 67 ?  2 :
        68 <= x <= 72 ?  3 :
        NA
      end
      map(:Height)
    end
  )
end

      

Or an alternative is to use Query.jl like this:

using DataFrames
using Query
using RDatasets

# Build the same input table as in the DataFramesMeta version.
women = dataset("datasets", "women");
women_new = vcat(
              women,
              DataFrame(Height = [NA; NA], Weight = @data [1; NA])
            )

# Query.jl version: iterate over the rows, skip those whose Height is null,
# and project Height, Weight and a computed `class` into a new DataFrame.
# Heights outside all three ranges get class 0 here (not NA).
@from i in women_new begin
    @where !isnull(i.Height)
    @select {
        i.Height, i.Weight,
        class = 0 <= i.Height <= 60 ?  1 :
               61 <= i.Height <= 67 ?  2 :
               68 <= i.Height <= 72 ?  3 :
                0
    }
    @collect DataFrame
end

      

The result is now correct:

15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1   │ 58     │ 115    │ 1     │
│ 2   │ 59     │ 117    │ 1     │
│ 3   │ 60     │ 120    │ 1     │
│ 4   │ 61     │ 123    │ 2     │
│ 5   │ 62     │ 126    │ 2     │
│ 6   │ 63     │ 129    │ 2     │
│ 7   │ 64     │ 132    │ 2     │
│ 8   │ 65     │ 135    │ 2     │
│ 9   │ 66     │ 139    │ 2     │
│ 10  │ 67     │ 142    │ 2     │
│ 11  │ 68     │ 146    │ 3     │
│ 12  │ 69     │ 150    │ 3     │
│ 13  │ 70     │ 154    │ 3     │
│ 14  │ 71     │ 159    │ 3     │
│ 15  │ 72     │ 164    │ 3     │

      

If we don't want to filter out the NA rows and instead want to keep all rows of the data, then the best I can do is:



# Classify every row, including those with a missing Height, without filtering.
# `ifelse` cannot take NA in its condition, so NA heights are first replaced by
# the sentinel -1 in a temporary column, the classification runs on that column,
# and the temporary column is removed again at the end.
@> begin
  women_new
  @transform(
    Height_New = NA,
    Height_New = ifelse(isna(:Height), -1, :Height))
  @transform(
    Class = NA,
    # Element-wise `.==` is required here: a plain `==` compares the whole
    # column with the scalar and yields a single value instead of a per-row
    # mask, so the sentinel rows would only become NA by falling through every
    # range test below.
    Class = ifelse(:Height_New .== -1, NA,
              ifelse((:Height_New .>=  0) & (:Height_New .<= 60), 1,
              ifelse((:Height_New .>= 61) & (:Height_New .<= 67), 2,
              ifelse((:Height_New .>= 68) & (:Height_New .<= 72), 3, NA))))
  )
  delete!(:Height_New)
end

      

Update: The above code can be simplified further:

# Same classification without filtering, written as a scalar function mapped
# over :Height (the inner `@>` evaluates map(f, :Height)). The NA check comes
# first so that the range comparisons are only evaluated for non-missing
# heights; NA heights and heights outside all ranges both yield NA.
@> begin
    women_new
    @transform(
        Class = @> begin
            function (x)
                isna(x)       ? NA :
                 0 <= x <= 60 ?  1 :
                61 <= x <= 67 ?  2 :
                68 <= x <= 72 ?  3 :
                NA
            end
            map(:Height)
        end
    )
end

      

Or an alternative is to use Query.jl like this:

# Query.jl version without the null filter: every row of women_new is projected
# to Height, Weight and a computed `class`.
# NOTE(review): the fallback branch is 0 and there is no explicit null check on
# i.Height, whereas the output printed after this snippet shows NA for the rows
# with missing Height -- confirm how null heights are classified here.
@from i in women_new begin
    @select {
        i.Height, i.Weight,
        class = 0 <= i.Height <= 60 ?  1 :
               61 <= i.Height <= 67 ?  2 :
               68 <= i.Height <= 72 ?  3 :
                0
    }
    @collect DataFrame
end

      

Output:

17×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1   │ 58     │ 115    │ 1     │
│ 2   │ 59     │ 117    │ 1     │
│ 3   │ 60     │ 120    │ 1     │
│ 4   │ 61     │ 123    │ 2     │
│ 5   │ 62     │ 126    │ 2     │
│ 6   │ 63     │ 129    │ 2     │
│ 7   │ 64     │ 132    │ 2     │
│ 8   │ 65     │ 135    │ 2     │
│ 9   │ 66     │ 139    │ 2     │
│ 10  │ 67     │ 142    │ 2     │
│ 11  │ 68     │ 146    │ 3     │
│ 12  │ 69     │ 150    │ 3     │
│ 13  │ 70     │ 154    │ 3     │
│ 14  │ 71     │ 159    │ 3     │
│ 15  │ 72     │ 164    │ 3     │
│ 16  │ NA     │ 1      │ NA    │
│ 17  │ NA     │ NA     │ NA    │

      

In this case, the code gets messy because ifelse cannot yet handle NA in its first argument.

+2


source







All Articles