A list is R's most flexible data structure - it's like a container that can hold different types of objects (vectors, matrices, data frames, even other lists) all together. Unlike vectors and matrices, whose elements must all be the same type, or data frames, whose columns must all have the same length, lists can contain completely different kinds of data of any size.
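Key characteristics:
- Elements can be of any type and any length, including other lists
- Elements are ordered and can be accessed by position
- Elements can optionally be named and accessed by name
- Elements can be added, modified, or removed after the list is created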
Scientific analogy: A list is like a laboratory filing system where each folder can contain completely different things - one folder might have data tables, another might have graphs, another might have text notes, and another might contain sub-folders with more materials.
# Simple list with different data types (elements are NOT named here)
experiment_results <- list(
  50,                                          # sample size
  c("Control", "Treatment_A", "Treatment_B"),  # treatment groups
  c(23.5, 45.2, 67.8, 34.1, 56.7),             # measurements
  as.Date("2024-03-15"),                       # experiment date
  TRUE                                         # significant?
)
# Without names, each element prints under a positional [[i]] header
experiment_results
[[1]]
[1] 50
[[2]]
[1] "Control" "Treatment_A" "Treatment_B"
[[3]]
[1] 23.5 45.2 67.8 34.1 56.7
[[4]]
[1] "2024-03-15"
[[5]]
[1] TRUE
# Same list with names
experiment_results <- list(
sample_size = 50,
treatment_groups = c("Control", "Treatment_A", "Treatment_B"),
measurements = c(23.5, 45.2, 67.8, 34.1, 56.7),
experiment_date = as.Date("2024-03-15"),
significant = TRUE
)
experiment_results
$sample_size
[1] 50
$treatment_groups
[1] "Control" "Treatment_A" "Treatment_B"
$measurements
[1] 23.5 45.2 67.8 34.1 56.7
$experiment_date
[1] "2024-03-15"
$significant
[1] TRUE
# Research project containing various data types
research_project <- list(
# Basic information
project_info = c("Climate Study", "Dr. Smith", "2024"),
# Data frame
field_data = data.frame(
site_id = 1:5,
temperature = c(18.2, 22.5, 19.8, 21.3, 20.1),
species_count = c(12, 18, 15, 20, 16)
),
# Matrix
correlation_matrix = matrix(c(1.0, 0.7, 0.3,
0.7, 1.0, 0.5,
0.3, 0.5, 1.0), nrow = 3),
# Vector
collection_dates = as.Date(c("2024-01-15", "2024-02-15", "2024-03-15")),
# Factor
habitat_types = factor(c("Forest", "Grassland", "Wetland"))
)
str(research_project)
List of 5
$ project_info : chr [1:3] "Climate Study" "Dr. Smith" "2024"
$ field_data :'data.frame': 5 obs. of 3 variables:
..$ site_id : int [1:5] 1 2 3 4 5
..$ temperature : num [1:5] 18.2 22.5 19.8 21.3 20.1
..$ species_count: num [1:5] 12 18 15 20 16
$ correlation_matrix: num [1:3, 1:3] 1 0.7 0.3 0.7 1 0.5 0.3 0.5 1
$ collection_dates : Date[1:3], format: "2024-01-15" "2024-02-15" "2024-03-15"
$ habitat_types : Factor w/ 3 levels "Forest","Grassland",..: 1 2 3
# Access using $ notation
experiment_results$sample_size
[1] 50
experiment_results$treatment_groups
[1] "Control" "Treatment_A" "Treatment_B"
# Access using double square brackets
experiment_results[["measurements"]]
[1] 23.5 45.2 67.8 34.1 56.7
experiment_results[["experiment_date"]]
[1] "2024-03-15"
# Access first element
experiment_results[[1]]
[1] 50
# Access third element
experiment_results[[3]]
[1] 23.5 45.2 67.8 34.1 56.7
# Multiple elements (returns a list)
experiment_results[1:2]
$sample_size
[1] 50
$treatment_groups
[1] "Control" "Treatment_A" "Treatment_B"
# Single brackets return a list
experiment_results[1]
$sample_size
[1] 50
class(experiment_results[1])
[1] "list"
# Double brackets return the actual element
experiment_results[[1]]
[1] 50
class(experiment_results[[1]])
[1] "numeric"
# Access data frame column within the list
research_project$field_data$temperature
[1] 18.2 22.5 19.8 21.3 20.1
# Access specific matrix element
research_project$correlation_matrix[2, 3]
[1] 0.5
# Multiple levels of access
research_project[["field_data"]][["site_id"]]
[1] 1 2 3 4 5
# Add a new element
experiment_results$p_value <- 0.003
experiment_results$notes <- "Excellent response to treatment"
# Add using double brackets
experiment_results[["replication"]] <- 3
# View updated list
names(experiment_results)
[1] "sample_size" "treatment_groups" "measurements" "experiment_date"
[5] "significant" "p_value" "notes" "replication"
# Change a value
experiment_results$sample_size <- 75
# Modify part of a vector
experiment_results$treatment_groups[2] <- "Treatment_Modified"
# Replace entire element
experiment_results$measurements <- c(25.1, 47.3, 68.9, 36.2, 58.1, 42.7)
# Remove using NULL
experiment_results$notes <- NULL
# Remove using negative indexing
experiment_results <- experiment_results[-7] # Remove 7th element
# Remove multiple elements
experiment_results[c("p_value", "replication")] <- NULL
gene_study <- list(
# Metadata
study_info = list(
title = "Gene Expression in Heat Stress",
pi_name = "Dr. Johnson",
species = "Arabidopsis thaliana",
n_samples = 48
),
# Raw data
expression_data = matrix(rnorm(1000), nrow = 100, ncol = 10),
# Sample information
sample_metadata = data.frame(
sample_id = paste0("S", 1:10),
treatment = rep(c("Control", "Heat_Stress"), each = 5),
timepoint = rep(c("0h", "2h", "6h", "12h", "24h"), 2),
batch = rep(c("Batch1", "Batch2"), c(6, 4))
),
# Analysis results
differential_genes = data.frame(
gene_id = paste0("Gene_", 1:50),
log_fold_change = rnorm(50, mean = 0, sd = 2),
p_value = runif(50, 0, 0.05),
significant = sample(c(TRUE, FALSE), 50, replace = TRUE)
),
# Statistical summaries
stats = list(
total_genes_tested = 15000,
significant_genes = 1250,
upregulated = 675,
downregulated = 575
)
)
# Access nested information
gene_study$study_info$species
[1] "Arabidopsis thaliana"
gene_study$stats$significant_genes
[1] 1250
# Access data frame within list
head(gene_study$sample_metadata)
sample_id treatment timepoint batch
1 S1 Control 0h Batch1
2 S2 Control 2h Batch1
3 S3 Control 6h Batch1
4 S4 Control 12h Batch1
5 S5 Control 24h Batch1
6 S6 Heat_Stress 0h Batch1
biodiversity_survey <- list(
# Site information
sites = data.frame(
site_id = paste0("Site_", LETTERS[1:8]),
latitude = runif(8, 40.1, 40.9),
longitude = runif(8, -74.2, -73.8),
habitat_type = factor(rep(c("Forest", "Grassland"), each = 4)),
elevation_m = sample(100:500, 8)
),
# Species data for each site
species_data = list(
Site_A = data.frame(
species = c("Quercus alba", "Acer rubrum", "Pinus strobus"),
abundance = c(15, 8, 12),
dbh_cm = c(45.2, 32.1, 38.7)
),
Site_B = data.frame(
species = c("Quercus alba", "Betula papyrifera"),
abundance = c(22, 6),
dbh_cm = c(52.3, 28.9)
)
# ... would continue for all sites
),
# Environmental measurements
environmental = array(
data = rnorm(96), # 8 sites × 4 variables × 3 seasons
dim = c(8, 4, 3),
dimnames = list(
Sites = paste0("Site_", LETTERS[1:8]),
Variables = c("Temperature", "Humidity", "SoilpH", "Light"),
Seasons = c("Spring", "Summer", "Fall")
)
),
# Survey metadata
survey_info = list(
dates = as.Date(c("2024-04-15", "2024-07-20", "2024-10-10")),
observers = c("Smith, J.", "Johnson, M.", "Brown, K."),
weather_conditions = c("Clear", "Partly cloudy", "Overcast")
)
)
# Access species data for specific site
biodiversity_survey$species_data$Site_A
species abundance dbh_cm
1 Quercus alba 15 45.2
2 Acer rubrum 8 32.1
3 Pinus strobus 12 38.7
# Access environmental data
biodiversity_survey$environmental[1, , "Spring"] # Site A, all variables, Spring
Temperature Humidity SoilpH Light
0.5855288 0.7094660 -0.1093033 -0.4534972
clinical_trial <- list(
# Trial design
protocol = list(
study_name = "Hypertension Treatment Study",
phase = "Phase III",
primary_endpoint = "Systolic BP reduction",
duration_weeks = 12,
target_enrollment = 300
),
# Patient data
patients = data.frame(
patient_id = paste0("P", 1001:1050),
age = sample(30:75, 50, replace = TRUE),
sex = sample(c("M", "F"), 50, replace = TRUE),
treatment_group = rep(c("Placebo", "Drug_5mg", "Drug_10mg"), length.out = 50),
baseline_sbp = rnorm(50, mean = 160, sd = 15)
),
# Measurements over time
longitudinal_data = array(
data = rnorm(600), # 50 patients × 4 visits × 3 measurements
dim = c(50, 4, 3),
dimnames = list(
Patients = paste0("P", 1001:1050),
Visits = c("Baseline", "Week4", "Week8", "Week12"),
Measures = c("SystolicBP", "DiastolicBP", "HeartRate")
)
),
# Adverse events
adverse_events = data.frame(
patient_id = sample(paste0("P", 1001:1050), 15),
event_type = sample(c("Headache", "Dizziness", "Nausea"), 15, replace = TRUE),
severity = factor(sample(c("Mild", "Moderate", "Severe"), 15, replace = TRUE)),
onset_day = sample(1:84, 15)
),
# Analysis results
results = list(
primary_analysis = list(
placebo_reduction = -2.3,
drug_5mg_reduction = -8.7,
drug_10mg_reduction = -12.4,
p_value = 0.001
),
safety_summary = list(
total_aes = 15,
serious_aes = 2,
discontinuations = 3
)
)
)
# Access patient data
head(clinical_trial$patients, 3)
patient_id age sex treatment_group baseline_sbp
1 P1001 44 M Placebo 162.6324
2 P1002 70 F Drug_5mg 144.1853
3 P1003 58 M Drug_10mg 172.3091
# Access nested results
clinical_trial$results$primary_analysis$p_value
[1] 0.001
# Access longitudinal measurements for one patient
clinical_trial$longitudinal_data["P1001", , ]
Measures
Visits SystolicBP DiastolicBP HeartRate
Baseline -1.8471897 0.10803133 -0.04168430
Week4 0.7641131 -0.46092008 0.01727168
Week8 2.1329982 -0.93260298 0.20073019
Week12 -1.5234584 0.08266733 -0.99723668
# Good for lists: Mixed data types and structures
# Generate the data once so that the stored model is fitted on exactly the
# data kept in raw_data (two separate rnorm() calls would give the model a
# different random sample than the one stored beside it)
analysis_data <- data.frame(x = 1:10, y = rnorm(10))
mixed_analysis <- list(
  raw_data = analysis_data,                      # data frame
  model = lm(y ~ x, data = analysis_data),       # fitted model object
  plots = "plot_filename.png",                   # character string
  parameters = c(intercept = 2.3, slope = 0.8),  # named numeric vector
  significance = TRUE                            # logical
)
# NOT good for lists: Homogeneous data (use data.frame instead)
# DON'T do this:
bad_list <- list(
sample1 = c(temp = 20, humidity = 65),
sample2 = c(temp = 22, humidity = 68),
sample3 = c(temp = 18, humidity = 62)
)
# DO this instead:
good_dataframe <- data.frame(
sample = c("sample1", "sample2", "sample3"),
temperature = c(20, 22, 18),
humidity = c(65, 68, 62)
)
# Create list of datasets
datasets <- list(
experiment1 = data.frame(values = rnorm(20, mean = 5)),
experiment2 = data.frame(values = rnorm(30, mean = 8)),
experiment3 = data.frame(values = rnorm(25, mean = 6))
)
# Apply function to each list element
means <- lapply(datasets, function(x) mean(x$values))
means
$experiment1
[1] 4.982071
$experiment2
[1] 7.936441
$experiment3
[1] 5.943856
# Simplify to vector if possible
unlist(means)
experiment1 experiment2 experiment3
4.982071 7.936441 5.943856
# Use sapply for direct vector output
sapply(datasets, function(x) mean(x$values))
experiment1 experiment2 experiment3
4.982071 7.936441 5.943856
# Two separate studies
study1 <- list(
participants = 50,
treatment = "Drug_A",
results = c(12, 15, 18, 14, 16)
)
study2 <- list(
participants = 45,
treatment = "Drug_B",
results = c(8, 11, 13, 10, 12)
)
# Combine into meta-analysis
meta_analysis <- list(
study1 = study1,
study2 = study2,
combined_results = c(study1$results, study2$results)
)
# Access nested data
meta_analysis$study1$participants
[1] 50
meta_analysis$combined_results
[1] 12 15 18 14 16 8 11 13 10 12
# List to data frame (when structure allows)
simple_list <- list(
temperature = c(20, 22, 18),
humidity = c(65, 68, 62),
pressure = c(1013, 1015, 1010)
)
# Convert to data frame
df_from_list <- data.frame(simple_list)
df_from_list
temperature humidity pressure
1 20 65 1013
2 22 68 1015
3 18 62 1010
# Data frame to list (columns become list elements)
list_from_df <- as.list(df_from_list)
str(list_from_df)
List of 3
$ temperature: num [1:3] 20 22 18
$ humidity : num [1:3] 65 68 62
$ pressure : num [1:3] 1013 1015 1010
# List names
names(experiment_results)
[1] "sample_size" "treatment_groups" "measurements" "experiment_date" "significant"
# List length (number of elements)
length(experiment_results)
[1] 5
# Check if object is a list
is.list(experiment_results)
[1] TRUE
# Structure of list
str(experiment_results)
List of 5
$ sample_size : num 75
$ treatment_groups: chr [1:3] "Control" "Treatment_Modified" "Treatment_B"
$ measurements : num [1:6] 25.1 47.3 68.9 36.2 58.1 42.7
$ experiment_date : Date[1:1], format: "2024-03-15"
$ significant : logi TRUE
# Nested list used to demonstrate flattening and recursive application
research_list <- list(
study1 = list(data = c(1, 2, 3), info = "preliminary"),
study2 = list(data = c(4, 5, 6), info = "final")
)
# Flatten list structure
unlist(research_list)
study1.data1 study1.data2 study1.data3 study1.info study2.data1 study2.data2
"1" "2" "3" "preliminary" "4" "5"
study2.data3 study2.info
"6" "final"
# Recursive application: replace every numeric leaf by its mean and leave the
# other leaves untouched. how = "list" keeps the nested structure; the default
# (how = "unlist") would flatten the result into a single character vector.
rapply(
  research_list,
  function(x) if (is.numeric(x)) mean(x) else x,
  how = "list"
)
$study1
$study1$data
[1] 2
$study1$info
[1] "preliminary"
$study2
$study2$data
[1] 5
$study2$info
[1] "final"
# Function to store complete analysis results
#' Fit `response ~ treatment` and bundle everything about the fit in one list
#'
#' @param data A data frame handed to `lm()` as its `data` argument; the
#'   formula `response ~ treatment` is evaluated against it.
#' @return A named list with the elements `raw_data`, `model_object`,
#'   `summary`, `coefficients`, `fitted_values`, `residuals`, `r_squared`,
#'   `p_value`, `significant` (`p_value < 0.05`) and `diagnostic_plots`
#'   (a placeholder string).
analyze_experiment <- function(data) {
  model <- lm(response ~ treatment, data = data)

  # Summarise the fit once and reuse it instead of recomputing it per element
  model_summary <- summary(model)

  # Row 2 of the coefficient table is the first term after the intercept;
  # column 4 holds its p-value, Pr(>|t|)
  treatment_p_value <- model_summary$coefficients[2, 4]

  list(
    raw_data = data,
    model_object = model,
    summary = model_summary,
    coefficients = coef(model),
    fitted_values = fitted(model),
    residuals = residuals(model),
    r_squared = model_summary$r.squared,
    p_value = treatment_p_value,
    significant = treatment_p_value < 0.05,
    diagnostic_plots = "Would store plot objects here"
  )
}
# Example data
exp_data <- data.frame(
treatment = factor(rep(c("Control", "Treatment"), each = 10)),
response = c(rnorm(10, 20, 5), rnorm(10, 25, 5))
)
# Run analysis - everything stored in one object
analysis_results <- analyze_experiment(exp_data)
# Access any part of the analysis
analysis_results$p_value
analysis_results$significant
# Store multiple related experiments
# One list per experiment (design, sample size, results) plus shared metadata
multi_experiment_study <- list(
  experiment_A = list(
    design = "Randomized controlled",
    n_subjects = 50,
    results = data.frame(
      subject = 1:50,
      treatment = sample(c("Control", "Drug"), 50, replace = TRUE),
      outcome = rnorm(50, 100, 15)
    )
  ),
  experiment_B = list(
    design = "Crossover",
    n_subjects = 30,
    results = data.frame(
      # Two rows per subject: rows 1-30 are Period1, rows 31-60 are Period2
      subject = rep(1:30, 2),
      period = rep(c("Period1", "Period2"), each = 30),
      # Crossover: every subject receives the opposite treatment in Period2
      treatment = c(
        rep(c("Drug", "Placebo"), 15),
        rep(c("Placebo", "Drug"), 15)
      ),
      outcome = rnorm(60, 95, 12)
    )
  ),
  meta_info = list(
    pi_name = "Dr. Research",
    funding_source = "NIH Grant 123456",
    publication_status = "In preparation"
  )
)

Lists are indispensable for complex scientific computing where you need to organize and access different types of related information efficiently. They're particularly useful for storing complete analysis results, managing multiple related datasets, and building flexible data analysis pipelines.