Multiple-Response Analysis: Cleaning of Duplicate Codes

classic Classic list List threaded Threaded
5 messages Options
Reply | Threaded
Open this post in threaded view
|

Multiple-Response Analysis: Cleaning of Duplicate Codes

G.Maubach
Hi All,

in my current project I am working with multiple-response questions
(MRSets):

-- Coding --
100 Main Code 1
110 Sub Code 1.1
120 Sub Code 1.2
130 Sub Code 1.3

200 Main Code 2
210 Sub Code 2.1
220 Sub Code 2.2
230 Sub Code 2.3

300 Main Code 3
310 Sub Code 3.1
320 Sub Code 3.2

The coding for the variables is too detailed. Therefore I have recoded all
sub codes to the respective main code, e.g. all 110, 120 and 130 to 100,
all 210, 220 and 230 to 200 and all 310, 320 and 330 to 300.

Now it happens that some respondents get several times the same main code.
If the coding was done for respondent 1 with 120 and 130 after recoding
the values are 100 and 100. If I count this, it would mean that I weight
the multiple values of this respondent by factor 2. This is not my aim. I
would like to count the 100 for the respective respondent only once.

Here is my script so far:

# -- cut --

library(expss)

# Sample multiple-response data: five response columns (c05_01 .. c05_05)
# with 17 rows; NA marks an unused response column. Row names are kept as
# character labels.
d_sample <- data.frame(
  c05_01 = c(110, 110, 130, 110, 110, 110, 110, 110, 110, 110, 110, 999,
             110, 495, 160, 110, 410),
  c05_02 = c(NA, NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
             170, NA, 130),
  c05_03 = c(rep(NA, 9), 410, rep(NA, 7)),
  # Columns 4 and 5 are entirely empty but typed as numeric.
  c05_04 = rep(NA_real_, 17),
  c05_05 = rep(NA_real_, 17),
  row.names = c("1", "2", "3", "4", "5", "10", "11", "12", "13", "14",
                "15", "20", "21", "22", "23", "24", "25")
)

# Select the multiple-response columns first so that the names of the
# recoded copies can be derived from them rather than hard-coding "five
# columns named c05_01 .. c05_05".
c05_xx <- d_sample %>%
  select(starts_with("c05_"))

# Collapse every detailed sub code into its main code
# (e.g. 110-195 -> 100, 910-999 -> 900).
c05_xx_r01 <- c05_xx %>%
  recode(c(
    110 %thru% 195 ~ 100,
    210 %thru% 295 ~ 200,
    310 %thru% 395 ~ 300,
    410 %thru% 495 ~ 400,
    510 %thru% 595 ~ 500,
    810 %thru% 895 ~ 800,
    910 %thru% 999 ~ 900))

# Name each recoded column after its source column plus the "_r01" suffix,
# then append the recoded columns to the original data.
names(c05_xx_r01) <- paste0(names(c05_xx), "_r01")
d_sample <- cbind(d_sample, c05_xx_r01)

# -- cut --

I would like to eliminate all duplicate codes, e.g. 100 and 100 for
respondents in row 3, 6, 13, 14 and 15 to 100 only once:

# -- cut --
# Desired result: the original five response columns plus their recoded
# "_r01" counterparts, in which a main code appears at most once per row.
d_sample_1 <- data.frame(
  c05_01 = c(110, 110, 130, 110, 110, 110, 110, 110, 110, 110, 110, 999,
             110, 495, 160, 110, 410),
  c05_02 = c(NA, NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
             170, NA, 130),
  c05_03 = c(rep(NA, 9), 410, rep(NA, 7)),
  c05_04 = rep(NA_real_, 17),
  c05_05 = rep(NA_real_, 17),
  c05_01_r01 = c(rep(100, 11), 900, 100, 400, 100, 100, 400),
  c05_02_r01 = c(rep(NA, 16), 100),
  c05_03_r01 = c(rep(NA, 9), 400, rep(NA, 7)),
  # The last two recoded columns are plain (logical) NA, as in the
  # original dump.
  c05_04_r01 = rep(NA, 17),
  c05_05_r01 = rep(NA, 17),
  row.names = c("1", "2", "3", "4", "5", "10", "11", "12", "13", "14",
                "15", "20", "21", "22", "23", "24", "25")
)

# -- cut --

How could I achieve this?

Kind regards

Georg

______________________________________________
[hidden email] mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
Reply | Threaded
Open this post in threaded view
|

Re: Multiple-Response Analysis: Cleaning of Duplicate Codes

Bert Gunter-2
If I understand you correctly, one way is:

> z <- rep(LETTERS[1:3],4)
> z
 [1] "A" "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C"
> z[!duplicated(z)]
[1] "A" "B" "C"


?duplicated

-- Bert

Bert Gunter

"The trouble with having an open mind is that people keep coming along
and sticking things into it."
-- Opus (aka Berkeley Breathed in his "Bloom County" comic strip )


On Tue, Apr 25, 2017 at 9:36 AM,  <[hidden email]> wrote:

> Hi All,
>
> in my current project I am working with multiple-response questions
> (MRSets):
>
> -- Coding --
> 100 Main Code 1
> 110 Sub Code 1.1
> 120 Sub Code 1.2
> 130 Sub Code 1.3
>
> 200 Main Code 2
> 210 Sub Code 2.1
> 220 Sub Code 2.2
> 230 Sub Code 2.3
>
> 300 Main Code 3
> 310 Sub Code 3.1
> 320 Sub Code 3.2
>
> The coding for the variables is to detailed. Therefore I have recoded all
> sub codes to the respective main code, e.g. all 110, 120 and 130 to 100,
> all 210, 220 and 230 to 200 and all 310, 320 and 330 to 300.
>
> Now it happens that some respondents get several times the same main code.
> If the coding was done for respondent 1 with 120 and 130 after recoding
> the values are 100 and 100. If I count this, it would mean that I weight
> the multiple values of this respondent by factor 2. This is not my aim. I
> would like to count the 100 for the respective respondent only once.
>
> Here is my script so far:
>
> # -- cut --
>
> library(expss)
>
> d_sample <-
>   structure(
>     list(
>       c05_01 = c(
>         110,
>         110,
>         130,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         999,
>         110,
>         495,
>         160,
>         110,
>         410
>       ),
>       c05_02 = c(NA,
>                  NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
> 170,
>                  NA, 130),
>       c05_03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 410,
>                  NA, NA, NA, NA, NA, NA, NA),
>       c05_04 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_05 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       )
>     ),
>     .Names = c("c05_01",
>                "c05_02", "c05_03", "c05_04", "c05_05"),
>     row.names = c(
>       "1",
>       "2",
>       "3",
>       "4",
>       "5",
>       "10",
>       "11",
>       "12",
>       "13",
>       "14",
>       "15",
>       "20",
>       "21",
>       "22",
>       "23",
>       "24",
>       "25"
>     ),
>     class = "data.frame"
>   )
>
> c05_xx_r01 <- d_sample %>%
>   select(starts_with("c05_")) %>%
>   recode(c(
>     110 %thru% 195 ~ 100,
>     210 %thru% 295 ~ 200,
>     310 %thru% 395 ~ 300,
>     410 %thru% 495 ~ 400,
>     510 %thru% 595 ~ 500,
>     810 %thru% 895 ~ 800,
>     910 %thru% 999 ~ 900))
> names(c05_xx_r01) <- paste0("c05_0", 1:5, "_r01")
> d_sample <- cbind(d_sample, c05_xx_r01)
>
> # -- cut --
>
> I would like to eliminate all duplicates codes, e. g. 100 and 100 for
> respondents in row 3, 6, 13, 14 and 15 to 100 only once:
>
> # -- cut --
> d_sample_1 <-
>   structure(
>     list(
>       c05_01 = c(
>         110,
>         110,
>         130,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         999,
>         110,
>         495,
>         160,
>         110,
>         410
>       ),
>       c05_02 = c(NA,
>                  NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
> 170,
>                  NA, 130),
>       c05_03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 410,
>                  NA, NA, NA, NA, NA, NA, NA),
>       c05_04 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_05 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_01_r01 = c(
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         900,
>         100,
>         400,
>         100,
>         100,
>         400
>       ),
>       c05_02_r01 = c(NA, NA, NA, NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, NA, 100),
>       c05_03_r01 = c(NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, 400, NA, NA, NA, NA, NA,
> NA, NA),
>       c05_04_r01 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA),
>       c05_05_r01 = c(NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
>     ),
>     .Names = c(
>       "c05_01",
>       "c05_02",
>       "c05_03",
>       "c05_04",
>       "c05_05",
>       "c05_01_r01",
>       "c05_02_r01",
>       "c05_03_r01",
>       "c05_04_r01",
>       "c05_05_r01"
>     ),
>     row.names = c(
>       "1",
>       "2",
>       "3",
>       "4",
>       "5",
>       "10",
>       "11",
>       "12",
>       "13",
>       "14",
>       "15",
>       "20",
>       "21",
>       "22",
>       "23",
>       "24",
>       "25"
>     ),
>     class = "data.frame"
>   )
>
> # -- cut --
>
> How could I achieve this?
>
> Kind regards
>
> Georg
>
> ______________________________________________
> [hidden email] mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.

______________________________________________
[hidden email] mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
Reply | Threaded
Open this post in threaded view
|

Re: Multiple-Response Analysis: Cleaning of Duplicate Codes

Boris Steipe
How about:

# Truncate every code to its hundreds group (e.g. 130 -> 100, 495 -> 400).
# NOTE(review): this presumably expects d_sample to hold only the original
# numeric code columns - confirm before running it after the cbind() step.
d_sample_1 <- floor(d_sample / 100) * 100

# Within each row, blank out every repeat of a value that already occurred
# further left. seq_len() makes the loop a no-op for a zero-row data frame,
# where 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(d_sample_1))) {
  d_sample_1[i, duplicated(unlist(d_sample_1[i, ]))] <- NA
}


B.


> On Apr 25, 2017, at 1:10 PM, Bert Gunter <[hidden email]> wrote:
>
> If I understand you correctly, one way is:
>
>> z <- rep(LETTERS[1:3],4)
>> z
> [1] "A" "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C"
>> z[!duplicated(z)]
> [1] "A" "B" "C"
>
>
> ?duplicated
>
> -- Bert
>
> Bert Gunter
>
> "The trouble with having an open mind is that people keep coming along
> and sticking things into it."
> -- Opus (aka Berkeley Breathed in his "Bloom County" comic strip )
>
>
> On Tue, Apr 25, 2017 at 9:36 AM,  <[hidden email]> wrote:
>> Hi All,
>>
>> in my current project I am working with multiple-response questions
>> (MRSets):
>>
>> -- Coding --
>> 100 Main Code 1
>> 110 Sub Code 1.1
>> 120 Sub Code 1.2
>> 130 Sub Code 1.3
>>
>> 200 Main Code 2
>> 210 Sub Code 2.1
>> 220 Sub Code 2.2
>> 230 Sub Code 2.3
>>
>> 300 Main Code 3
>> 310 Sub Code 3.1
>> 320 Sub Code 3.2
>>
>> The coding for the variables is to detailed. Therefore I have recoded all
>> sub codes to the respective main code, e.g. all 110, 120 and 130 to 100,
>> all 210, 220 and 230 to 200 and all 310, 320 and 330 to 300.
>>
>> Now it happens that some respondents get several times the same main code.
>> If the coding was done for respondent 1 with 120 and 130 after recoding
>> the values are 100 and 100. If I count this, it would mean that I weight
>> the multiple values of this respondent by factor 2. This is not my aim. I
>> would like to count the 100 for the respective respondent only once.
>>
>> Here is my script so far:
>>
>> # -- cut --
>>
>> library(expss)
>>
>> d_sample <-
>>  structure(
>>    list(
>>      c05_01 = c(
>>        110,
>>        110,
>>        130,
>>        110,
>>        110,
>>        110,
>>        110,
>>        110,
>>        110,
>>        110,
>>        110,
>>        999,
>>        110,
>>        495,
>>        160,
>>        110,
>>        410
>>      ),
>>      c05_02 = c(NA,
>>                 NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
>> 170,
>>                 NA, 130),
>>      c05_03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 410,
>>                 NA, NA, NA, NA, NA, NA, NA),
>>      c05_04 = c(
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_
>>      ),
>>      c05_05 = c(
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_
>>      )
>>    ),
>>    .Names = c("c05_01",
>>               "c05_02", "c05_03", "c05_04", "c05_05"),
>>    row.names = c(
>>      "1",
>>      "2",
>>      "3",
>>      "4",
>>      "5",
>>      "10",
>>      "11",
>>      "12",
>>      "13",
>>      "14",
>>      "15",
>>      "20",
>>      "21",
>>      "22",
>>      "23",
>>      "24",
>>      "25"
>>    ),
>>    class = "data.frame"
>>  )
>>
>> c05_xx_r01 <- d_sample %>%
>>  select(starts_with("c05_")) %>%
>>  recode(c(
>>    110 %thru% 195 ~ 100,
>>    210 %thru% 295 ~ 200,
>>    310 %thru% 395 ~ 300,
>>    410 %thru% 495 ~ 400,
>>    510 %thru% 595 ~ 500,
>>    810 %thru% 895 ~ 800,
>>    910 %thru% 999 ~ 900))
>> names(c05_xx_r01) <- paste0("c05_0", 1:5, "_r01")
>> d_sample <- cbind(d_sample, c05_xx_r01)
>>
>> # -- cut --
>>
>> I would like to eliminate all duplicates codes, e. g. 100 and 100 for
>> respondents in row 3, 6, 13, 14 and 15 to 100 only once:
>>
>> # -- cut --
>> d_sample_1 <-
>>  structure(
>>    list(
>>      c05_01 = c(
>>        110,
>>        110,
>>        130,
>>        110,
>>        110,
>>        110,
>>        110,
>>        110,
>>        110,
>>        110,
>>        110,
>>        999,
>>        110,
>>        495,
>>        160,
>>        110,
>>        410
>>      ),
>>      c05_02 = c(NA,
>>                 NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
>> 170,
>>                 NA, 130),
>>      c05_03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 410,
>>                 NA, NA, NA, NA, NA, NA, NA),
>>      c05_04 = c(
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_
>>      ),
>>      c05_05 = c(
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_,
>>        NA_real_
>>      ),
>>      c05_01_r01 = c(
>>        100,
>>        100,
>>        100,
>>        100,
>>        100,
>>        100,
>>        100,
>>        100,
>>        100,
>>        100,
>>        100,
>>        900,
>>        100,
>>        400,
>>        100,
>>        100,
>>        400
>>      ),
>>      c05_02_r01 = c(NA, NA, NA, NA, NA, NA, NA, NA,
>>                     NA, NA, NA, NA, NA, NA, NA, NA, 100),
>>      c05_03_r01 = c(NA, NA,
>>                     NA, NA, NA, NA, NA, NA, NA, 400, NA, NA, NA, NA, NA,
>> NA, NA),
>>      c05_04_r01 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
>>                     NA, NA, NA, NA, NA, NA),
>>      c05_05_r01 = c(NA, NA, NA, NA, NA,
>>                     NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
>>    ),
>>    .Names = c(
>>      "c05_01",
>>      "c05_02",
>>      "c05_03",
>>      "c05_04",
>>      "c05_05",
>>      "c05_01_r01",
>>      "c05_02_r01",
>>      "c05_03_r01",
>>      "c05_04_r01",
>>      "c05_05_r01"
>>    ),
>>    row.names = c(
>>      "1",
>>      "2",
>>      "3",
>>      "4",
>>      "5",
>>      "10",
>>      "11",
>>      "12",
>>      "13",
>>      "14",
>>      "15",
>>      "20",
>>      "21",
>>      "22",
>>      "23",
>>      "24",
>>      "25"
>>    ),
>>    class = "data.frame"
>>  )
>>
>> # -- cut --
>>
>> How could I achieve this?
>>
>> Kind regards
>>
>> Georg
>>
>> ______________________________________________
>> [hidden email] mailing list -- To UNSUBSCRIBE and more, see
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>
> ______________________________________________
> [hidden email] mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.

______________________________________________
[hidden email] mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
Reply | Threaded
Open this post in threaded view
|

Antwort: Re: Multiple-Response Analysis: Cleaning of Duplicate Codes (SOLVED)

G.Maubach
In reply to this post by Bert Gunter-2
Hi Bert,

many thanks for your reply. I appreciate your help a lot.

I would like to do the operation (= finding the duplicates) row-wise.

During this night a solution showed up in my dreams :) Instead of using
duplicated() to flag and filter the values, I could use unique() with
the same result. I tested:

# -- cut --

# Distinct recoded values of each row (MARGIN 1 iterates over rows).
apply(c05_xx_r01, 1, FUN = unique)

# -- cut --

This finds the unique values for each row. That is nice but lacks the
requirement that I need a dataframe with a set of variables back that is
as long as the total amount of unique values for the complete
data.frame/matrix or the amount of variable of the original data.frame
respectively.

The result of the above operation is a list instead of a data.frame because
the number of resulting values varies from 1 to 7.
Therefore no data.frame but a list is returned.

I searched the web for a solution and found:

http://stackoverflow.com/questions/15753091/convert-mixed-length-named-list-to-data-frame

The complete solution would then look like:

# -- cut --

library(stringi)
library(tidyverse)
# Distinct recoded values per row. lapply() always yields a list, whereas
# apply() simplifies to a matrix whenever every row has the same number of
# distinct values - which is not the list input stri_list2matrix() expects.
my_list <- lapply(
  seq_len(nrow(c05_xx_r01)),
  function(i) unique(unlist(c05_xx_r01[i, ]))
)
# Pad the ragged list with NA into a rectangular matrix, one row per input
# row. The closing parenthesis of as_tibble() was missing.
my_tibble <- as_tibble(stringi::stri_list2matrix(my_list, byrow = TRUE))
# DONE !

# -- cut --

All-in-all thanks again for your help.

Kind regards

Georg

P.S.: I had a look into ?unique. The statement "unique(c05_xx_r01, MARGIN =
1)" does not do the job, because this looks for unique combinations of values
across all columns. But that is not the desired outcome.




Von:    Bert Gunter <[hidden email]>
An:     [hidden email],
Kopie:  R-help <[hidden email]>
Datum:  25.04.2017 19:10
Betreff:        Re: [R] Multiple-Response Analysis: Cleaning of Duplicate
Codes



If I understand you correctly, one way is:

> z <- rep(LETTERS[1:3],4)
> z
 [1] "A" "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C"
> z[!duplicated(z)]
[1] "A" "B" "C"


?duplicated

-- Bert

Bert Gunter

"The trouble with having an open mind is that people keep coming along
and sticking things into it."
-- Opus (aka Berkeley Breathed in his "Bloom County" comic strip )


On Tue, Apr 25, 2017 at 9:36 AM,  <[hidden email]> wrote:

> Hi All,
>
> in my current project I am working with multiple-response questions
> (MRSets):
>
> -- Coding --
> 100 Main Code 1
> 110 Sub Code 1.1
> 120 Sub Code 1.2
> 130 Sub Code 1.3
>
> 200 Main Code 2
> 210 Sub Code 2.1
> 220 Sub Code 2.2
> 230 Sub Code 2.3
>
> 300 Main Code 3
> 310 Sub Code 3.1
> 320 Sub Code 3.2
>
> The coding for the variables is to detailed. Therefore I have recoded
all
> sub codes to the respective main code, e.g. all 110, 120 and 130 to 100,
> all 210, 220 and 230 to 200 and all 310, 320 and 330 to 300.
>
> Now it happens that some respondents get several times the same main
code.
> If the coding was done for respondent 1 with 120 and 130 after recoding
> the values are 100 and 100. If I count this, it would mean that I weight
> the multiple values of this respondent by factor 2. This is not my aim.
I

> would like to count the 100 for the respective respondent only once.
>
> Here is my script so far:
>
> # -- cut --
>
> library(expss)
>
> d_sample <-
>   structure(
>     list(
>       c05_01 = c(
>         110,
>         110,
>         130,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         999,
>         110,
>         495,
>         160,
>         110,
>         410
>       ),
>       c05_02 = c(NA,
>                  NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
> 170,
>                  NA, 130),
>       c05_03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 410,
>                  NA, NA, NA, NA, NA, NA, NA),
>       c05_04 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_05 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       )
>     ),
>     .Names = c("c05_01",
>                "c05_02", "c05_03", "c05_04", "c05_05"),
>     row.names = c(
>       "1",
>       "2",
>       "3",
>       "4",
>       "5",
>       "10",
>       "11",
>       "12",
>       "13",
>       "14",
>       "15",
>       "20",
>       "21",
>       "22",
>       "23",
>       "24",
>       "25"
>     ),
>     class = "data.frame"
>   )
>
> c05_xx_r01 <- d_sample %>%
>   select(starts_with("c05_")) %>%
>   recode(c(
>     110 %thru% 195 ~ 100,
>     210 %thru% 295 ~ 200,
>     310 %thru% 395 ~ 300,
>     410 %thru% 495 ~ 400,
>     510 %thru% 595 ~ 500,
>     810 %thru% 895 ~ 800,
>     910 %thru% 999 ~ 900))
> names(c05_xx_r01) <- paste0("c05_0", 1:5, "_r01")
> d_sample <- cbind(d_sample, c05_xx_r01)
>
> # -- cut --
>
> I would like to eliminate all duplicates codes, e. g. 100 and 100 for
> respondents in row 3, 6, 13, 14 and 15 to 100 only once:
>
> # -- cut --
> d_sample_1 <-
>   structure(
>     list(
>       c05_01 = c(
>         110,
>         110,
>         130,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         999,
>         110,
>         495,
>         160,
>         110,
>         410
>       ),
>       c05_02 = c(NA,
>                  NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
> 170,
>                  NA, 130),
>       c05_03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 410,
>                  NA, NA, NA, NA, NA, NA, NA),
>       c05_04 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_05 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_01_r01 = c(
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         900,
>         100,
>         400,
>         100,
>         100,
>         400
>       ),
>       c05_02_r01 = c(NA, NA, NA, NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, NA, 100),
>       c05_03_r01 = c(NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, 400, NA, NA, NA, NA,
NA,

> NA, NA),
>       c05_04_r01 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA),
>       c05_05_r01 = c(NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
>     ),
>     .Names = c(
>       "c05_01",
>       "c05_02",
>       "c05_03",
>       "c05_04",
>       "c05_05",
>       "c05_01_r01",
>       "c05_02_r01",
>       "c05_03_r01",
>       "c05_04_r01",
>       "c05_05_r01"
>     ),
>     row.names = c(
>       "1",
>       "2",
>       "3",
>       "4",
>       "5",
>       "10",
>       "11",
>       "12",
>       "13",
>       "14",
>       "15",
>       "20",
>       "21",
>       "22",
>       "23",
>       "24",
>       "25"
>     ),
>     class = "data.frame"
>   )
>
> # -- cut --
>
> How could I achieve this?
>
> Kind regards
>
> Georg
>
> ______________________________________________
> [hidden email] mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide
http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.


        [[alternative HTML version deleted]]

______________________________________________
[hidden email] mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
Reply | Threaded
Open this post in threaded view
|

Antwort: Re: Multiple-Response Analysis: Cleaning of Duplicate Codes (SOLVED)

G.Maubach
In reply to this post by Bert Gunter-2
Hi Bert,

many thanks for your reply. I appreciate your help a lot.

I would like to do the operation (= finding the duplicates) row-wise.

During this night a solution showed up in my dreams :) Instead of using
duplicated() to flag and filter the values, I could use unique() with
the same result. I tested:

# -- cut --

# Distinct recoded values of each row (MARGIN 1 iterates over rows).
apply(c05_xx_r01, 1, FUN = unique)

# -- cut --

This finds the unique values for each row. That is nice but lacks the
requirement that I need a dataframe with a set of variables back that is
as long as the total amount of unique values for the complete
data.frame/matrix or the amount of variable of the original data.frame
respectively.

The result of the above operation is a list instead of a data.frame because
the number of resulting values varies from 1 to 7.
Therefore no data.frame but a list is returned.

I searched the web for a solution and found:

http://stackoverflow.com/questions/15753091/convert-mixed-length-named-list-to-data-frame

The complete solution would then look like:

# -- cut --

library(stringi)
library(tidyverse)
# Distinct recoded values per row. lapply() always yields a list, whereas
# apply() simplifies to a matrix whenever every row has the same number of
# distinct values - which is not the list input stri_list2matrix() expects.
my_list <- lapply(
  seq_len(nrow(c05_xx_r01)),
  function(i) unique(unlist(c05_xx_r01[i, ]))
)
# Pad the ragged list with NA into a rectangular matrix, one row per input
# row. The closing parenthesis of as_tibble() was missing.
my_tibble <- as_tibble(stringi::stri_list2matrix(my_list, byrow = TRUE))
# DONE !

# -- cut --

All-in-all thanks again for your help.

Kind regards

Georg

P.S.: I had a look into ?unique. The statement "unique(c05_xx_r01, MARGIN =
1)" does not do the job, because this looks for unique combinations of values
across all columns. But that is not the desired outcome.




Von:    Bert Gunter <[hidden email]>
An:     [hidden email],
Kopie:  R-help <[hidden email]>
Datum:  25.04.2017 19:10
Betreff:        Re: [R] Multiple-Response Analysis: Cleaning of Duplicate
Codes



If I understand you correctly, one way is:

> z <- rep(LETTERS[1:3],4)
> z
 [1] "A" "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C"
> z[!duplicated(z)]
[1] "A" "B" "C"


?duplicated

-- Bert

Bert Gunter

"The trouble with having an open mind is that people keep coming along
and sticking things into it."
-- Opus (aka Berkeley Breathed in his "Bloom County" comic strip )


On Tue, Apr 25, 2017 at 9:36 AM,  <[hidden email]> wrote:

> Hi All,
>
> in my current project I am working with multiple-response questions
> (MRSets):
>
> -- Coding --
> 100 Main Code 1
> 110 Sub Code 1.1
> 120 Sub Code 1.2
> 130 Sub Code 1.3
>
> 200 Main Code 2
> 210 Sub Code 2.1
> 220 Sub Code 2.2
> 230 Sub Code 2.3
>
> 300 Main Code 3
> 310 Sub Code 3.1
> 320 Sub Code 3.2
>
> The coding for the variables is too detailed. Therefore I have recoded
> all
> sub codes to the respective main code, e.g. all 110, 120 and 130 to 100,
> all 210, 220 and 230 to 200 and all 310, 320 and 330 to 300.
>
> Now it happens that some respondents get several times the same main
> code.
> If the coding was done for respondent 1 with 120 and 130 after recoding
> the values are 100 and 100. If I count this, it would mean that I weight
> the multiple values of this respondent by factor 2. This is not my aim.
> I would like to count the 100 for the respective respondent only once.
>
> Here is my script so far:
>
> # -- cut --
>
> library(expss)
>
> d_sample <-
>   structure(
>     list(
>       c05_01 = c(
>         110,
>         110,
>         130,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         999,
>         110,
>         495,
>         160,
>         110,
>         410
>       ),
>       c05_02 = c(NA,
>                  NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
> 170,
>                  NA, 130),
>       c05_03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 410,
>                  NA, NA, NA, NA, NA, NA, NA),
>       c05_04 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_05 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       )
>     ),
>     .Names = c("c05_01",
>                "c05_02", "c05_03", "c05_04", "c05_05"),
>     row.names = c(
>       "1",
>       "2",
>       "3",
>       "4",
>       "5",
>       "10",
>       "11",
>       "12",
>       "13",
>       "14",
>       "15",
>       "20",
>       "21",
>       "22",
>       "23",
>       "24",
>       "25"
>     ),
>     class = "data.frame"
>   )
>
> c05_xx_r01 <- d_sample %>%
>   select(starts_with("c05_")) %>%
>   recode(c(
>     110 %thru% 195 ~ 100,
>     210 %thru% 295 ~ 200,
>     310 %thru% 395 ~ 300,
>     410 %thru% 495 ~ 400,
>     510 %thru% 595 ~ 500,
>     810 %thru% 895 ~ 800,
>     910 %thru% 999 ~ 900))
> names(c05_xx_r01) <- paste0("c05_0", 1:5, "_r01")
> d_sample <- cbind(d_sample, c05_xx_r01)
>
> # -- cut --
>
> I would like to eliminate all duplicates codes, e. g. 100 and 100 for
> respondents in row 3, 6, 13, 14 and 15 to 100 only once:
>
> # -- cut --
> d_sample_1 <-
>   structure(
>     list(
>       c05_01 = c(
>         110,
>         110,
>         130,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         110,
>         999,
>         110,
>         495,
>         160,
>         110,
>         410
>       ),
>       c05_02 = c(NA,
>                  NA, 120, NA, NA, 150, NA, NA, 170, 160, NA, NA, NA, NA,
> 170,
>                  NA, 130),
>       c05_03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 410,
>                  NA, NA, NA, NA, NA, NA, NA),
>       c05_04 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_05 = c(
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_,
>         NA_real_
>       ),
>       c05_01_r01 = c(
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         100,
>         900,
>         100,
>         400,
>         100,
>         100,
>         400
>       ),
>       c05_02_r01 = c(NA, NA, NA, NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, NA, 100),
>       c05_03_r01 = c(NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, 400, NA, NA, NA, NA,
NA,

> NA, NA),
>       c05_04_r01 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA),
>       c05_05_r01 = c(NA, NA, NA, NA, NA,
>                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
>     ),
>     .Names = c(
>       "c05_01",
>       "c05_02",
>       "c05_03",
>       "c05_04",
>       "c05_05",
>       "c05_01_r01",
>       "c05_02_r01",
>       "c05_03_r01",
>       "c05_04_r01",
>       "c05_05_r01"
>     ),
>     row.names = c(
>       "1",
>       "2",
>       "3",
>       "4",
>       "5",
>       "10",
>       "11",
>       "12",
>       "13",
>       "14",
>       "15",
>       "20",
>       "21",
>       "22",
>       "23",
>       "24",
>       "25"
>     ),
>     class = "data.frame"
>   )
>
> # -- cut --
>
> How could I achieve this?
>
> Kind regards
>
> Georg
>
> ______________________________________________
> [hidden email] mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide
> http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.

______________________________________________
[hidden email] mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.