Daten bearbeiten und zusammenfassen
Daten aus Verhaltensexperimenten bearbeiten und zusammenfassen, Datenpunkte identifizieren.
Ob eine Variable als factor definiert ist, wird als Attribut gespeichert. Attribute werden aber in einem .csv-File nicht mitgespeichert; deshalb müssen wir die Gruppierungsvariablen wieder als factor definieren.
# Convert every character column (the grouping variables) back to a factor;
# the factor attribute is not stored in a CSV file, so it is lost on import.
data <- data |>
  mutate(across(where(is.character), as.factor))

# Inspect the column types after the conversion.
glimpse(data)
Rows: 1,440
Columns: 9
$ trial     <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
$ ID        <fct> JH, JH, JH, JH, JH, JH, JH, JH, JH, JH, JH, JH, JH, JH, JH, …
$ cue       <fct> right, right, none, none, left, none, none, left, left, none…
$ direction <fct> right, right, right, right, left, right, left, left, right, …
$ response  <dbl> 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
$ rt        <dbl> 0.7136441, 0.6271285, 0.6703410, 0.5738488, 0.8405913, 0.667…
$ choice    <fct> right, right, left, right, right, right, right, left, left, …
$ correct   <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ condition <fct> valid, valid, neutral, neutral, valid, neutral, neutral, val…
Binary Choices
Pro Versuchsperson
data
# A tibble: 1,440 × 9
   trial ID    cue   direction response    rt choice correct condition
   <dbl> <fct> <fct> <fct>        <dbl> <dbl> <fct>    <dbl> <fct>    
 1     0 JH    right right            1 0.714 right        1 valid    
 2     1 JH    right right            1 0.627 right        1 valid    
 3     2 JH    none  right            0 0.670 left         0 neutral  
 4     3 JH    none  right            1 0.574 right        1 neutral  
 5     4 JH    left  left             1 0.841 right        0 valid    
 6     5 JH    none  right            1 0.668 right        1 neutral  
 7     6 JH    none  left             1 1.12  right        0 neutral  
 8     7 JH    left  left             0 0.640 left         1 valid    
 9     8 JH    left  right            0 1.13  left         0 invalid  
10     9 JH    none  right            1 1.03  right        1 neutral  
# … with 1,430 more rows
# Group the trials by participant (ID) and cue condition. The printed rows
# are the same as before; only the "Groups" header is added.
data |> 
  group_by(ID, condition)
# A tibble: 1,440 × 9
# Groups:   ID, condition [27]
   trial ID    cue   direction response    rt choice correct condition
   <dbl> <fct> <fct> <fct>        <dbl> <dbl> <fct>    <dbl> <fct>    
 1     0 JH    right right            1 0.714 right        1 valid    
 2     1 JH    right right            1 0.627 right        1 valid    
 3     2 JH    none  right            0 0.670 left         0 neutral  
 4     3 JH    none  right            1 0.574 right        1 neutral  
 5     4 JH    left  left             1 0.841 right        0 valid    
 6     5 JH    none  right            1 0.668 right        1 neutral  
 7     6 JH    none  left             1 1.12  right        0 neutral  
 8     7 JH    left  left             0 0.640 left         1 valid    
 9     8 JH    left  right            0 1.13  left         0 invalid  
10     9 JH    none  right            1 1.03  right        1 neutral  
# … with 1,430 more rows
accuracy
# A tibble: 27 × 5
# Groups:   ID [9]
   ID    condition     N ncorrect accuracy
   <fct> <fct>     <int>    <dbl>    <dbl>
 1 JH    invalid      16       13   0.812 
 2 JH    neutral      80       66   0.825 
 3 JH    valid        64       60   0.938 
 4 NS    invalid      16       11   0.688 
 5 NS    neutral      80       56   0.7   
 6 NS    valid        64       58   0.906 
 7 rh    invalid      16        2   0.125 
 8 rh    neutral      80       64   0.8   
 9 rh    valid        64       61   0.953 
10 sb    invalid      16        1   0.0625
# … with 17 more rows
Visualisieren
# Accuracy per participant and cue condition: coloured bars with a connecting
# line and large points, one panel per participant.
ggplot(
  data = accuracy,
  mapping = aes(x = condition, y = accuracy, fill = condition)
) +
  geom_col() +
  geom_line(mapping = aes(group = ID), size = 2) +
  geom_point(size = 8) +
  # Fixed colour per cue condition.
  scale_fill_manual(values = c(
    invalid = "#9E0142",
    neutral = "#C4C4B7",
    valid = "#2EC762"
  )) +
  facet_wrap(~ID) +
  labs(
    title = "Accuracy per person/condition",
    x = "Cue",
    y = "Proportion correct"
  ) +
  theme_linedraw(base_size = 28)
Über Versuchsperson aggregieren
Ein Exkurs über Within-person Standardfehler
# Reshape the wide data into long format: every column whose name contains
# "test" becomes a row, with the column name stored in `condition` and the
# measurement in `value`.
dfl <- dfw |>
  pivot_longer(
    contains("test"),
    names_to = "condition",
    values_to = "value"
  ) |>
  mutate(condition = as_factor(condition))

# Condition means with within-subject standard errors and 95% confidence
# intervals.
dflsum <- dfl |>
  Rmisc::summarySEwithin(
    measurevar = "value",
    withinvars = "condition",
    idvar = "subject",
    na.rm = FALSE,
    conf.interval = 0.95
  )

# Plot the condition means with their within-subject confidence intervals.
dflsum |>
  ggplot(aes(x = condition, y = value, group = 1)) +
  geom_line() +
  geom_errorbar(width = 0.1, aes(ymin = value - ci, ymax = value + ci)) +
  geom_point(shape = 21, size = 3, fill = "white") +
  ylim(40, 60)
# One line per subject, showing each individual's values across conditions.
# NOTE(review): ymin and ymax are not defined in the code shown here; they are
# presumably set earlier -- confirm.
ggplot(
  data = dfl,
  mapping = aes(x = condition, y = value, colour = subject, group = subject)
) +
  geom_line() +
  geom_point(shape = 21, fill = "white") +
  ylim(ymin, ymax)
# Normalise the values within each subject; the result carries the normed
# measurements in the `valueNormed` column used below.
dfNorm_long <- Rmisc::normDataWithin(
  data = dfl,
  idvar = "subject",
  measurevar = "value"
)
# Open the help page of the normalisation function.
?Rmisc::normDataWithin

# Per-subject lines for the normed values.
ggplot(
  data = dfNorm_long,
  mapping = aes(
    x = condition,
    y = valueNormed,
    colour = subject,
    group = subject
  )
) +
  geom_line() +
  geom_point(shape = 21, fill = "white") +
  ylim(ymin, ymax)
# summarySE() (unlike summarySEwithin()) treats condition as if it were a
# between-subjects variable, i.e. it ignores the repeated measures.
dflsum_between <- dfl |>
  Rmisc::summarySE(
    measurevar = "value",
    groupvars = "condition",
    na.rm = FALSE,
    conf.interval = 0.95
  )
dflsum_between
  condition  N value       sd       se       ci
1   pretest 10 47.74 8.598992 2.719240 6.151348
2  posttest 10 51.43 7.253972 2.293907 5.189179
# Overlay both kinds of confidence interval on the condition means:
# between-subject CIs in red, within-subject CIs (from dflsum) in black.
ggplot(
  data = dflsum_between,
  mapping = aes(x = condition, y = value, group = 1)
) +
  geom_line() +
  geom_errorbar(
    mapping = aes(ymin = value - ci, ymax = value + ci),
    width = 0.1,
    colour = "red"
  ) +
  geom_errorbar(
    mapping = aes(ymin = value - ci, ymax = value + ci),
    data = dflsum,
    width = 0.1
  ) +
  geom_point(shape = 21, size = 3, fill = "white") +
  ylim(ymin, ymax)
Within-person Standardfehler
# One coloured line per participant showing accuracy across cue conditions.
ggplot(
  data = accuracy,
  mapping = aes(x = condition, y = accuracy, colour = ID, group = ID)
) +
  geom_line() +
  geom_point(shape = 21, fill = "white")
Der Standardfehler ist definiert als: \[SE = sd / \sqrt{n}\]
Leider gibt es in R keine Funktion, welche den Standardfehler berechnet (schätzt); wir können aber ganz einfach selber eine Funktion definieren.
# Standard error of the mean: the sample standard deviation divided by the
# square root of the number of observations.
se <- function(x) {
  sd(x) / sqrt(length(x))
}

# Accuracy per cue condition, pooled over all participants and trials.
datasum <- data |>
  group_by(condition) |>
  summarise(
    N = n(),
    accuracy = mean(correct),
    sd = sd(correct),
    se = se(correct)
  )
datasum
# A tibble: 3 × 5
  condition     N accuracy    sd     se
  <fct>     <int>    <dbl> <dbl>  <dbl>
1 invalid     144    0.389 0.489 0.0408
2 neutral     720    0.629 0.483 0.0180
3 valid       576    0.825 0.381 0.0159
# Same summary via Rmisc: mean, sd, se and 95% CI of `correct` per condition,
# treating condition as a between-subjects variable.
datasum_2 <- Rmisc::summarySE(
  data = data,
  measurevar = "correct",
  groupvars = "condition",
  na.rm = FALSE,
  conf.interval = 0.95
)
datasum_2
  condition   N   correct        sd         se         ci
1   invalid 144 0.3888889 0.4891996 0.04076663 0.08058308
2   neutral 720 0.6291667 0.4833637 0.01801390 0.03536613
3     valid 576 0.8246528 0.3805943 0.01585810 0.03114686
# Within-subject version: condition is a within-subject variable and
# participants are identified by `ID`.
datasum_3 <- Rmisc::summarySEwithin(
  data = data,
  measurevar = "correct",
  withinvars = "condition",
  idvar = "ID",
  na.rm = FALSE,
  conf.interval = 0.95
)
datasum_3
  condition   N   correct        sd         se         ci
1   invalid 144 0.3888889 0.5773528 0.04811273 0.09510406
2   neutral 720 0.6291667 0.5726512 0.02134145 0.04189901
3     valid 576 0.8246528 0.4523391 0.01884746 0.03701827
# Mean accuracy per condition with +/- 1 within-subject standard error
# (red error bars); the plot is stored so it can be reused later.
p_accuracy <- ggplot(
  data = datasum_3,
  mapping = aes(x = condition, y = correct, group = 1)
) +
  geom_line() +
  geom_errorbar(
    mapping = aes(ymin = correct - se, ymax = correct + se),
    width = 0.1,
    colour = "red"
  ) +
  geom_point(shape = 21, size = 3, fill = "white")
p_accuracy
Reaktionszeiten
Pro Versuchsperson
Wir fassen die Daten pro Person pro Block mit Mittelwert, Median und Standardabweichung zusammen.
by_subj
# A tibble: 27 × 5
# Groups:   ID [9]
   ID    condition  mean median     sd
   <fct> <fct>     <dbl>  <dbl>  <dbl>
 1 JH    invalid   0.775  0.739 0.163 
 2 JH    neutral   0.799  0.733 0.202 
 3 JH    valid     0.696  0.658 0.190 
 4 NS    invalid   0.894  0.913 0.207 
 5 NS    neutral   0.885  0.844 0.201 
 6 NS    valid     0.738  0.715 0.191 
 7 rh    invalid   0.423  0.389 0.151 
 8 rh    neutral   0.525  0.503 0.0841
 9 rh    valid     0.443  0.390 0.185 
10 sb    invalid   0.376  0.341 0.0924
# … with 17 more rows
Einfachere Version:
# Mean response time per participant and cue condition: coloured bars with a
# connecting line and large points, one panel per participant.
ggplot(
  data = by_subj,
  mapping = aes(x = condition, y = mean, fill = condition)
) +
  geom_col() +
  geom_line(mapping = aes(group = ID), size = 2) +
  geom_point(size = 8) +
  # Fixed colour per cue condition.
  scale_fill_manual(values = c(
    invalid = "#9E0142",
    neutral = "#C4C4B7",
    valid = "#2EC762"
  )) +
  facet_wrap(~ID) +
  labs(x = "Cue", y = "Response time") +
  theme_linedraw(base_size = 28)
# Mean response time per condition with +/- 1 standard error, one panel per
# participant, each with its own y axis.
# NOTE(review): the printed by_subj table (ID, condition, mean, median, sd)
# has no `se` column; this plot presumably relies on a version of by_subj
# that also contains `se` -- confirm.
ggplot(data = by_subj, mapping = aes(condition, mean)) +
  geom_line(mapping = aes(group = 1), linetype = 3) +
  geom_errorbar(
    mapping = aes(ymin = mean - se, ymax = mean + se),
    width = 0.2,
    size = 1,
    color = "blue"
  ) +
  geom_point(size = 2) +
  facet_wrap(~ID, scales = "free_y")
Über Versuchsperson aggregieren
# Response-time summary per condition with within-subject standard errors;
# trials with a missing rt are removed before summarising.
rtsum <- Rmisc::summarySEwithin(
  data = drop_na(data, rt),
  measurevar = "rt",
  withinvars = "condition",
  idvar = "ID",
  na.rm = FALSE,
  conf.interval = 0.95
)
rtsum
  condition   N        rt        sd         se         ci
1   invalid 141 0.7055247 0.2204498 0.01856522 0.03670444
2   neutral 710 0.7238269 0.2449543 0.00919297 0.01804870
3     valid 568 0.6716487 0.2482698 0.01041717 0.02046095
# Mean response time per condition with +/- 1 within-subject standard error
# (red error bars); stored so it can be combined with the accuracy plot.
p_rt <- rtsum |>
  ggplot(aes(x = condition, y = rt, group = 1)) +
  geom_line() +
  geom_errorbar(
    width = 0.1,
    aes(ymin = rt - se, ymax = rt + se),
    colour = "red"
  ) +
  geom_point(shape = 21, size = 3, fill = "white")
p_rt

# Combine the accuracy and response-time plots.
# NOTE(review): `/` on two ggplot objects presumably relies on the patchwork
# package being attached -- confirm.
p_accuracy / p_rt
Reuse
Citation
BibTeX citation:
@online{ellis2022,
  author = {Andrew Ellis},
  title = {Daten Bearbeiten Und Zusammenfassen},
  date = {2022-03-15},
  url = {https://kogpsy.github.io/neuroscicomplabFS22//pages/chapters/04_summarizing_data.html},
  langid = {en}
}
For attribution, please cite this work as:
Andrew Ellis. 2022. “Daten Bearbeiten Und Zusammenfassen.”
March 15, 2022. https://kogpsy.github.io/neuroscicomplabFS22//pages/chapters/04_summarizing_data.html.