Data Exercise: Analysis of age, gender effects on night time sleep using a synthetic dataset

In this exercise, I chose to work with a dataset that simulates human sleep behavior, focusing on the circadian rhythm, the natural sleep cycle over 24 hours. The dataset includes information from different types of people, organized by age group (newborns, children, adults, and elders), gender (female and male), and sleep-disorder status (Regular or Insomniac).

It records how much people sleep during the hours defined here as night time (from 10 PM to 8 AM), and allows us to explore how age, gender, and insomnia status affect the number of hours they sleep.

First step was to generate a synthetic dataset.

The code below simulates sleep patterns for different types of people. Each person belongs to a group (“Regular” or “Insomniac”) and was randomly assigned a gender and an age type (“Newborn”, “Child”, “Adult”, or “Elder”), from which an age was then drawn.

The 24-hour day was split into 12 time bins, each 2 hours long (e.g., 00–02, 02–04, …, 22–24). For each person, the code calculated how much of their sleep happened in each bin, based on their bedtime and sleep duration.

library(dplyr)
Warning: package 'dplyr' was built under R version 4.4.3

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyr)
Warning: package 'tidyr' was built under R version 4.4.3
library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.4.3
set.seed(1)

# Define simulation parameters
n <- 100  # simulated people per group
groups <- c("Regular", "Insomniac")
# Start hours of the twelve 2-hour bins covering one day (0, 2, ..., 22).
# The sequence must stop at 22: a start of 24 wraps around to a second copy
# of the 00-02 bin (labelled "24-02"), which duplicates that column.
bins <- seq(0, 22, by = 2)
# Bin labels such as "00-02"; the end hour wraps at midnight, so the last
# label is "22-00".
labels <- paste0(sprintf("%02d", bins), "-", sprintf("%02d", (bins + 2) %% 24))

## AI Disclosure: this part below was created with the assistance of ChatGPT: 

# Simulate one individual's sleep episode.
#
# group  Either "Regular" or "Insomniac". It only influences the sleep
#        pattern of Adults and Elders; Newborns and Children get an
#        age-specific pattern regardless of group.
# Returns a one-row data.frame with columns start, end (clock hours in
# [0, 24)), duration (hours), group, age, gender and Age_type.
simulate_sleep <- function(group) {
  # Fail fast on an unknown group: otherwise the pattern selection below would
  # leave `start`/`dur` undefined for Adults and Elders.
  valid_groups <- c("Regular", "Insomniac")
  if (!is.character(group) || length(group) != 1L || !(group %in% valid_groups)) {
    stop("`group` must be \"Regular\" or \"Insomniac\".", call. = FALSE)
  }

  # Randomly assign age type (70% of draws are Adults)
  age_type <- sample(
    c("Newborn", "Child", "Adult", "Elder"),
    1,
    prob = c(0.1, 0.1, 0.7, 0.1)
  )
  # Draw an age (years) from a distribution specific to the age type
  age <- switch(age_type,
                "Newborn" = round(runif(1, 0.1, 1.5), 2),
                "Child" = round(runif(1, 3, 12), 1),
                "Adult" = round(rnorm(1, 35, 10)),
                "Elder" = round(rnorm(1, 70, 5)))

  # Mean and sd of bedtime (clock hour) and of sleep duration (hours).
  # The age type takes precedence over the group, which creates the
  # correlation between Age_type and sleep.
  pattern <- if (age_type == "Newborn") {
    list(start = c(20, 1), dur = c(10, 1))
  } else if (age_type == "Child") {
    list(start = c(21, 0.5), dur = c(9, 0.7))
  } else if (group == "Regular") {
    list(start = c(22.5, 0.5), dur = c(7.5, 0.5))
  } else {
    list(start = c(2.0, 1.5), dur = c(6, 1.5))
  }

  # Bedtime is drawn before duration, then gender, so results for a given
  # seed are reproducible.
  start <- rnorm(1, pattern$start[1], pattern$start[2])
  dur <- rnorm(1, pattern$dur[1], pattern$dur[2])
  gender <- sample(c("Male", "Female"), 1)

  # Wrap both ends of the episode onto the 24-hour clock
  start <- start %% 24
  end <- (start + dur) %% 24

  data.frame(
    start = start,
    end = end,
    duration = dur,
    group = group,
    age = age,
    gender = gender,
    Age_type = age_type
  )
}

# Build the full cohort: n simulated people for each group, in group order,
# stacked into a single data frame
sleep_df <- do.call(rbind, lapply(rep(groups, each = n), simulate_sleep))

# Allocate sleep into time bins
#
# Spreads one sleep episode across the 2-hour bins of the day. The episode is
# sampled every 15 minutes from `start` to `start + duration` (clock times
# wrapped at 24 h); each bin receives `duration` multiplied by the fraction of
# samples that fall inside it. This is a sampled approximation, so a fully
# slept bin can come out slightly different from exactly 2 hours.
#
# start       Bedtime as a clock hour.
# end         Wake time; not used by the calculation (kept so that existing
#             callers keep working).
# duration    Sleep length in hours; values <= 0 allocate no sleep.
# bin_starts  Start hours of the 2-hour bins; defaults to the script-level
#             `bins` vector.
# Returns a numeric vector with one entry (hours asleep) per bin.
allocate_sleep <- function(start, end, duration, bin_starts = bins) {
  # No sleep to distribute; also avoids seq() failing on a negative duration
  if (duration <= 0) {
    return(numeric(length(bin_starts)))
  }

  # Sample times do not depend on the bin, so compute them once
  sample_times <- (seq(0, duration, by = 0.25) + start) %% 24
  bin_ends <- (bin_starts + 2) %% 24

  vapply(seq_along(bin_starts), function(i) {
    lo <- bin_starts[i]
    hi <- bin_ends[i]
    # A bin whose end wrapped past midnight (hi <= lo) covers [lo, 24) and [0, hi)
    inside <- if (lo < hi) {
      sample_times >= lo & sample_times < hi
    } else {
      sample_times >= lo | sample_times < hi
    }
    mean(inside, na.rm = TRUE) * duration
  }, numeric(1))
}

# Bin sleep data: store each person's per-bin sleep vector in a list-column
sleep_binned <- sleep_df %>%
  as_tibble() %>%
  mutate(sleep_vec = Map(allocate_sleep, start, end, duration))

# Wide format for averaging: one row per person, one column per time bin.
# The list-column is spread into sleep_vec_1, sleep_vec_2, ... and those
# columns are then renamed, in order, to the bin labels.
sleep_matrix <- sleep_binned[c("group", "gender", "age", "Age_type", "sleep_vec")] %>%
  unnest_wider(col = sleep_vec, names_sep = "_") %>%
  rename_with(.fn = function(old_names) labels, .cols = starts_with("sleep_vec"))

# Summary table: mean hours asleep in each 2-hour bin, one row per Age_type
sleep_summary <- summarise(
  group_by(sleep_matrix, Age_type),
  across(.cols = all_of(labels), .fns = mean),
  .groups = "drop"
)

# Inspect the structure of the data (column names, types, first values):

str(sleep_matrix)
tibble [200 × 17] (S3: tbl_df/tbl/data.frame)
 $ group   : chr [1:200] "Regular" "Regular" "Regular" "Regular" ...
 $ gender  : chr [1:200] "Male" "Female" "Male" "Female" ...
 $ age     : num [1:200] 32 20 11.9 29 35 44 41 30 39 68 ...
 $ Age_type: chr [1:200] "Adult" "Adult" "Child" "Adult" ...
 $ 00-02   : num [1:200] 1.97 1.96 1.96 1.95 1.98 ...
 $ 02-04   : num [1:200] 1.97 1.96 1.96 1.95 1.98 ...
 $ 04-06   : num [1:200] 1.97 1.47 1.96 1.47 1.98 ...
 $ 06-08   : num [1:200] 1.233 0 0.981 0 0.742 ...
 $ 08-10   : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
 $ 10-12   : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
 $ 12-14   : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
 $ 14-16   : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
 $ 16-18   : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
 $ 18-20   : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
 $ 20-22   : num [1:200] 0 0 1.227 0.733 0 ...
 $ 22-00   : num [1:200] 0.986 1.961 1.963 1.955 1.236 ...
 $ 24-02   : num [1:200] 1.97 1.96 1.96 1.95 1.98 ...
# Per-column summary statistics (range, quartiles, mean for numeric columns)
summary(sleep_matrix)
    group              gender               age          Age_type        
 Length:200         Length:200         Min.   : 0.21   Length:200        
 Class :character   Class :character   1st Qu.:20.00   Class :character  
 Mode  :character   Mode  :character   Median :33.00   Mode  :character  
                                       Mean   :32.52                     
                                       3rd Qu.:45.00                     
                                       Max.   :77.00                     
     00-02            02-04           04-06           06-08       
 Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
 1st Qu.:0.7435   1st Qu.:1.936   1st Qu.:1.486   1st Qu.:0.0000  
 Median :1.9540   Median :1.959   Median :1.944   Median :0.2466  
 Mean   :1.4451   Mean   :1.768   Mean   :1.661   Mean   :0.6714  
 3rd Qu.:1.9756   3rd Qu.:1.978   3rd Qu.:1.972   3rd Qu.:1.2347  
 Max.   :1.9996   Max.   :2.000   Max.   :1.998   Max.   :1.9976  
     08-10            10-12             12-14              14-16       16-18  
 Min.   :0.0000   Min.   :0.00000   Min.   :0.000000   Min.   :0   Min.   :0  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0   1st Qu.:0  
 Median :0.0000   Median :0.00000   Median :0.000000   Median :0   Median :0  
 Mean   :0.2089   Mean   :0.04059   Mean   :0.002489   Mean   :0   Mean   :0  
 3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0   3rd Qu.:0  
 Max.   :1.9976   Max.   :1.99092   Max.   :0.497730   Max.   :0   Max.   :0  
     18-20             20-22            22-00           24-02       
 Min.   :0.00000   Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
 1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.7435  
 Median :0.00000   Median :0.0000   Median :1.240   Median :1.9540  
 Mean   :0.06315   Mean   :0.3257   Mean   :1.057   Mean   :1.4451  
 3rd Qu.:0.00000   3rd Qu.:0.2442   3rd Qu.:1.954   3rd Qu.:1.9756  
 Max.   :1.99245   Max.   :1.9976   Max.   :2.000   Max.   :1.9996  

Box plot showing the age distribution of people with and without insomnia

# One box per group (Regular vs. Insomniac) showing the spread of ages
ggplot(data = sleep_matrix, mapping = aes(x = group, y = age, fill = group)) +
  geom_boxplot(colour = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Age Distribution by Group (Box Plot)",
    x = "Group",
    y = "Age (years)"
  ) +
  scale_fill_brewer(palette = "Set2")

Box Plot of sleep time at night (from 22:00 to 08:00) and age_type (Adult, Child, Elder, Newborn):

# Step 1: Bins that make up the night (22:00 to 08:00)
night_bins <- c("22-00", "00-02", "02-04", "04-06", "06-08")

# Step 2: Add each person's total nighttime sleep by summing those bins
sleep_night <- mutate(
  sleep_matrix,
  Night_sleep = rowSums(pick(all_of(night_bins)), na.rm = TRUE)
)

# Step 3: One box per age type showing total nighttime sleep
ggplot(
  data = sleep_night,
  mapping = aes(x = Age_type, y = Night_sleep, fill = Age_type)
) +
  geom_boxplot(colour = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Nighttime Sleep Duration (22:00–08:00) by Age Type",
    x = "Age Type",
    y = "Total Sleep (hrs)"
  ) +
  scale_fill_brewer(palette = "Set3")

Circular plot of sleep time per category:

# Long format for the circular plot: one row per (Age_type, time bin)
sleep_long <- pivot_longer(
  sleep_summary,
  cols = -Age_type,
  names_to = "time_bin",
  values_to = "sleep_amount"
)
# Derive the bin's start hour from the first two characters of its label,
# plus a clock-style label and the matching angle on a 24-hour circle
sleep_long <- mutate(
  sleep_long,
  bin_start = as.numeric(substr(time_bin, start = 1, stop = 2)),
  bin_label = paste0(bin_start, ":00"),
  bin_angle = 2 * pi * bin_start / 24
)

# Polar bar chart: one sector per 2-hour bin, bars dodged by Age_type
ggplot(
  data = sleep_long,
  mapping = aes(x = factor(bin_start), y = sleep_amount, fill = Age_type)
) +
  geom_col(position = "dodge", width = 1, colour = "black") +
  coord_polar(start = pi / 2) +
  scale_x_discrete(
    breaks = as.character(bins),
    labels = function(hour) paste0(hour, ":00")
  ) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Circular sleep behavior by age type",
    x = "Time of Day",
    y = "Avg Hours Asleep (per 2h bin)"
  ) +
  # theme_minimal() must come before theme() so the tweaks are not overwritten
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    axis.title.y = element_text(margin = margin(r = 10))
  )

Check if the variables predict sleep time:

# Model 1: nighttime sleep explained by age type.
# Age_type is a character column, so lm() dummy-codes it; the coefficients
# are differences from the Adult baseline.
fit1 <- lm(formula = Night_sleep ~ Age_type, data = sleep_night)
summary(object = fit1)

Call:
lm(formula = Night_sleep ~ Age_type, data = sleep_night)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.8659 -0.9613  0.3760  1.1427  2.2386 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      6.34773    0.11994  52.925  < 2e-16 ***
Age_typeChild    1.77574    0.32898   5.398 1.94e-07 ***
Age_typeElder   -0.07072    0.31634  -0.224    0.823    
Age_typeNewborn  0.81264    0.34367   2.365    0.019 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.404 on 196 degrees of freedom
Multiple R-squared:  0.1456,    Adjusted R-squared:  0.1325 
F-statistic: 11.13 on 3 and 196 DF,  p-value: 8.845e-07
# Refits the same Age_type model and prints only its call and coefficients
lm(formula = Night_sleep ~ Age_type, data = sleep_night)

Call:
lm(formula = Night_sleep ~ Age_type, data = sleep_night)

Coefficients:
    (Intercept)    Age_typeChild    Age_typeElder  Age_typeNewborn  
        6.34773          1.77574         -0.07072          0.81264  

The model shows that Age_type significantly predicts night time sleep, with Children and Newborns sleeping significantly more than Adults. Elders do not differ significantly from Adults, and the model explains about 14.6% of the variance in sleep duration (R² = 0.1456).

# Model 2: nighttime sleep explained by gender (Female is the baseline level)
fit2 <- lm(formula = Night_sleep ~ gender, data = sleep_night)
summary(object = fit2)

Call:
lm(formula = Night_sleep ~ gender, data = sleep_night)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.0760 -1.0078  0.4585  1.0249  3.3177 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   6.6396     0.1434  46.311   <2e-16 ***
genderMale   -0.0818     0.2149  -0.381    0.704    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.511 on 198 degrees of freedom
Multiple R-squared:  0.000731,  Adjusted R-squared:  -0.004316 
F-statistic: 0.1448 on 1 and 198 DF,  p-value: 0.7039
# Refits the same gender model and prints only its call and coefficients
lm(formula = Night_sleep ~ gender, data = sleep_night)

Call:
lm(formula = Night_sleep ~ gender, data = sleep_night)

Coefficients:
(Intercept)   genderMale  
     6.6396      -0.0818  

The model shows that gender does not significantly predict night time sleep duration: males sleep about 0.08 hours less than females on average, a difference that is not statistically significant (p = 0.704).

# Model 3: nighttime sleep explained by age in years (continuous predictor)
fit3 <- lm(formula = Night_sleep ~ age, data = sleep_night)
summary(object = fit3)

Call:
lm(formula = Night_sleep ~ age, data = sleep_night)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.0604 -1.0861  0.3997  1.0539  2.8303 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.173229   0.199109  36.027  < 2e-16 ***
age         -0.017527   0.005222  -3.356 0.000947 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.47 on 198 degrees of freedom
Multiple R-squared:  0.05383,   Adjusted R-squared:  0.04905 
F-statistic: 11.26 on 1 and 198 DF,  p-value: 0.0009475
# Refits the same age model and prints only its call and coefficients
lm(formula = Night_sleep ~ age, data = sleep_night)

Call:
lm(formula = Night_sleep ~ age, data = sleep_night)

Coefficients:
(Intercept)          age  
    7.17323     -0.01753  

The model indicates that age is a statistically significant predictor of night time sleep, with each additional year of age associated with a decrease of ~0.018 hours (just over 1 minute) in sleep duration (p = 0.00095). However, the effect size is small and the model explains only ~5.4% of the variance in night time sleep (R-squared: 0.05383).

# Scatter plot for Model 3. The predictor (age) goes on x and the response
# (night time sleep) on y, so the fitted line is the regression of
# Night_sleep on age rather than the reverse.
ggplot(sleep_night, aes(x = age, y = Night_sleep)) +
  geom_point(color = "black", size = 1.8) +
  # The explicit formula avoids the "using formula = 'y ~ x'" message
  geom_smooth(method = "lm", formula = y ~ x, color = "black", se = FALSE,
              linewidth = 0.8) +
  labs(title = "Night time sleep vs age",
       x = "Age (years)",
       y = "Night time sleep (hrs, 22:00–08:00)") +
  theme_minimal(base_size = 12)
`geom_smooth()` using formula = 'y ~ x'

# Model 4: nighttime sleep explained by insomnia status
# (Insomniac is the baseline level, so the coefficient is Regular - Insomniac)
fit4 <- lm(formula = Night_sleep ~ group, data = sleep_night)
summary(object = fit4)

Call:
lm(formula = Night_sleep ~ group, data = sleep_night)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1988 -0.5934 -0.0280  0.5262  3.4674 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)    5.6806     0.1193   47.61   <2e-16 ***
groupRegular   1.8453     0.1687   10.94   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.193 on 198 degrees of freedom
Multiple R-squared:  0.3766,    Adjusted R-squared:  0.3734 
F-statistic: 119.6 on 1 and 198 DF,  p-value: < 2.2e-16
# Refits the same group model and prints only its call and coefficients
lm(formula = Night_sleep ~ group, data = sleep_night)

Call:
lm(formula = Night_sleep ~ group, data = sleep_night)

Coefficients:
 (Intercept)  groupRegular  
       5.681         1.845  

This model shows that group is a strong and highly significant predictor of sleep duration (p < 2e-16). On average, individuals in the Regular (Normal) group sleep 1.85 hours more than those in the Insomniac group.

# Individual night-time sleep values per group (jittered horizontally only),
# with a horizontal bar marking each group's mean
ggplot(data = sleep_night, mapping = aes(x = group, y = Night_sleep)) +
  geom_point(
    position = position_jitter(width = 0.2, height = 0),
    colour = "black",
    size = 1.8,
    alpha = 0.6
  ) +
  # NOTE: ggplot2 >= 4.0.0 reports `fatten` as deprecated in favour of
  # `middle.linewidth`
  stat_summary(
    fun = mean,
    geom = "crossbar",
    width = 0.3,
    colour = "black",
    fatten = 0.8
  ) +
  labs(
    title = "Night time sleep duration by group",
    x = "Group",
    y = "Nighttime Sleep (hrs)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.title = element_text(color = "black"),
    axis.text = element_text(color = "black"),
    panel.grid = element_blank()
  )
Warning: The `fatten` argument of `geom_crossbar()` is deprecated as of ggplot2 4.0.0.
ℹ Please use the `middle.linewidth` argument instead.