A data frame is R's most important data structure for scientific data analysis. It's a two-dimensional table where each column can contain different types of data (numbers, text, dates, etc.), while each row represents a single observation or case. Think of it as a spreadsheet designed specifically for scientific research.
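Key characteristics:
- Two-dimensional: data is organized into rows and columns
- Each column holds a single data type, but different columns can hold different types (numbers, text, logicals, dates, etc.)
- Each row represents one observation or case
- All columns have the same number of rows
- Columns have names, which are used to access them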
Real-world analogy: A data frame is like a lab notebook page where each row is an experimental observation and each column is a different measurement or characteristic you recorded.
Creating a data frame with the data.frame() function:
# Climate research data
climate_data <- data.frame(
station_id = c("A01", "B02", "C03", "D04", "E05"),
temperature = c(23.5, 18.2, 31.0, 15.8, 27.3),
humidity = c(68, 45, 82, 55, 71),
precipitation = c(12.5, 0.0, 45.2, 8.1, 22.8),
wind_speed = c(15.2, 22.1, 8.5, 18.7, 11.3),
is_coastal = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)
climate_data
station_id temperature humidity precipitation wind_speed is_coastal
1 A01 23.5 68 12.5 15.2 TRUE
2 B02 18.2 45 0.0 22.1 FALSE
3 C03 31.0 82 45.2 8.5 TRUE
4 D04 15.8 55 8.1 18.7 FALSE
5 E05 27.3 71 22.8 11.3 TRUE
# Plant growth experiment
plant_experiment <- data.frame(
plant_id = paste0("P", 1:8),
species = rep(c("Arabidopsis", "Tomato"), each = 4),
treatment = rep(c("Control", "Fertilizer", "Control", "Fertilizer"), 2),
initial_height_cm = c(2.1, 2.3, 2.0, 2.2, 15.2, 14.8, 15.5, 15.0),
final_height_cm = c(8.5, 12.3, 7.8, 11.9, 45.2, 52.1, 43.8, 50.5),
days_to_flower = c(28, 25, 30, 26, 65, 58, 67, 60),
survived = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE)
)
plant_experiment
plant_id species treatment initial_height_cm final_height_cm days_to_flower survived
1 P1 Arabidopsis Control 2.1 8.5 28 TRUE
2 P2 Arabidopsis Fertilizer 2.3 12.3 25 TRUE
3 P3 Arabidopsis Control 2.0 7.8 30 TRUE
4 P4 Arabidopsis Fertilizer 2.2 11.9 26 TRUE
5 P5 Tomato Control 15.2 45.2 65 TRUE
6 P6 Tomato Fertilizer 14.8 52.1 58 TRUE
7 P7 Tomato Control 15.5 43.8 67 FALSE
8 P8 Tomato Fertilizer 15.0 50.5 60 TRUE
# These functions create data frames automatically
# bird_data <- read.csv("bird_observations.csv")
# gene_data <- read.table("gene_expression.txt", header = TRUE)
# climate_data <- read.delim("weather_station.txt")
# Check if it's a data frame
is.data.frame(climate_data)
[1] TRUE
# Get dimensions (rows, columns)
dim(climate_data)
[1] 5 6
nrow(climate_data) # Number of rows (observations)
[1] 5
ncol(climate_data) # Number of columns (variables)
[1] 6
# Get column names (variable names)
names(climate_data)
[1] "station_id" "temperature" "humidity" "precipitation" "wind_speed" "is_coastal"
colnames(climate_data) # Same as names()
[1] "station_id" "temperature" "humidity" "precipitation" "wind_speed" "is_coastal"
# Get row names
rownames(climate_data)
[1] "1" "2" "3" "4" "5"
# Get structure - very useful for understanding your data
str(climate_data)
'data.frame': 5 obs. of 6 variables:
$ station_id : chr "A01" "B02" "C03" "D04" ...
$ temperature : num 23.5 18.2 31 15.8 27.3
$ humidity : num 68 45 82 55 71
$ precipitation: num 12.5 0 45.2 8.1 22.8
$ wind_speed : num 15.2 22.1 8.5 18.7 11.3
$ is_coastal : logi TRUE FALSE TRUE FALSE TRUE
# Summary statistics
summary(climate_data)
station_id temperature humidity precipitation wind_speed
Length:5 Min. :15.80 Min. :45.0 Min. : 0.00 Min. : 8.50
Class :character 1st Qu.:18.20 1st Qu.:55.0 1st Qu.: 8.10 1st Qu.:11.30
Mode :character Median :23.50 Median :68.0 Median :12.50 Median :15.20
Mean :23.16 Mean :64.2 Mean :17.72 Mean :15.16
3rd Qu.:27.30 3rd Qu.:71.0 3rd Qu.:22.80 3rd Qu.:18.70
Max. :31.00 Max. :82.0 Max. :45.20 Max. :22.10
is_coastal
Mode :logical
FALSE:2
TRUE :3
# Method 1: Using $ notation (most common)
climate_data$temperature
[1] 23.5 18.2 31.0 15.8 27.3
climate_data$station_id
[1] "A01" "B02" "C03" "D04" "E05"
# Method 2: Using square brackets with column names
climate_data[, "temperature"]
[1] 23.5 18.2 31.0 15.8 27.3
climate_data[["temperature"]] # Double brackets return vector
[1] 23.5 18.2 31.0 15.8 27.3
# Method 3: Using column numbers
climate_data[, 2] # Second column (temperature)
[1] 23.5 18.2 31.0 15.8 27.3
# Multiple columns
climate_data[, c("temperature", "humidity")]
temperature humidity
1 23.5 68
2 18.2 45
3 31.0 82
4 15.8 55
5 27.3 71
# Single row
climate_data[3, ] # Third observation (all variables)
station_id temperature humidity precipitation wind_speed is_coastal
3 C03 31 82 45.2 8.5 TRUE
# Multiple rows
climate_data[c(1, 3, 5), ] # Rows 1, 3, and 5
station_id temperature humidity precipitation wind_speed is_coastal
1 A01 23.5 68 12.5 15.2 TRUE
3 C03 31.0 82 45.2 8.5 TRUE
5 E05 27.3 71 22.8 11.3 TRUE
# Row range
climate_data[2:4, ] # Rows 2 through 4
station_id temperature humidity precipitation wind_speed is_coastal
2 B02 18.2 45 0.0 22.1 FALSE
3 C03 31.0 82 45.2 8.5 TRUE
4 D04 15.8 55 8.1 18.7 FALSE
# Single cell: row 2, column 3
climate_data[2, 3]
[1] 45
# Single cell using names
climate_data[2, "humidity"]
[1] 45
# Multiple cells
climate_data[c(1, 3), c("temperature", "precipitation")]
temperature precipitation
1 23.5 12.5
3 31.0 45.2
# Stations with high temperature (> 25°C)
hot_stations <- climate_data[climate_data$temperature > 25, ]
hot_stations
station_id temperature humidity precipitation wind_speed is_coastal
3 C03 31.0 82 45.2 8.5 TRUE
5 E05 27.3 71 22.8 11.3 TRUE
# Coastal stations only
coastal_data <- climate_data[climate_data$is_coastal == TRUE, ]
coastal_data
station_id temperature humidity precipitation wind_speed is_coastal
1 A01 23.5 68 12.5 15.2 TRUE
3 C03 31.0 82 45.2 8.5 TRUE
5 E05 27.3 71 22.8 11.3 TRUE
# Multiple conditions: hot AND coastal
hot_coastal <- climate_data[climate_data$temperature > 25 & climate_data$is_coastal == TRUE, ]
hot_coastal
station_id temperature humidity precipitation wind_speed is_coastal
3 C03 31.0 82 45.2 8.5 TRUE
5 E05 27.3 71 22.8 11.3 TRUE
# Stations with significant precipitation (> 10mm)
wet_stations <- climate_data[climate_data$precipitation > 10, ]
wet_stations
station_id temperature humidity precipitation wind_speed is_coastal
1 A01 23.5 68 12.5 15.2 TRUE
3 C03 31.0 82 45.2 8.5 TRUE
5 E05 27.3 71 22.8 11.3 TRUE
Using the subset() function (alternative approach):
# Equivalent to above, but more readable
subset(climate_data, temperature > 25)
subset(climate_data, is_coastal == TRUE)
subset(climate_data, temperature > 25 & is_coastal == TRUE)
# Select specific columns while filtering
subset(climate_data, temperature > 20, select = c(station_id, temperature, humidity))
station_id temperature humidity
1 A01 23.5 68
3 C03 31.0 82
5 E05 27.3 71
# Calculate growth rate for plant experiment
plant_experiment$growth_rate <- (plant_experiment$final_height_cm - plant_experiment$initial_height_cm) / plant_experiment$initial_height_cm
# Add temperature categories
climate_data$temp_category <- ifelse(climate_data$temperature > 25, "Hot",
ifelse(climate_data$temperature > 20, "Warm", "Cool"))
climate_data
station_id temperature humidity precipitation wind_speed is_coastal temp_category
1 A01 23.5 68 12.5 15.2 TRUE Warm
2 B02 18.2 45 0.0 22.1 FALSE Cool
3 C03 31.0 82 45.2 8.5 TRUE Hot
4 D04 15.8 55 8.1 18.7 FALSE Cool
5 E05 27.3 71 22.8 11.3 TRUE Hot
# Add new column with calculation
climate_data$comfort_index <- (climate_data$temperature * 0.7) + (climate_data$humidity * 0.3)# Correct a measurement error
climate_data[2, "temperature"] <- 18.7 # Fix station B02 temperature
# Update multiple values
climate_data$precipitation[climate_data$precipitation == 0] <- 0.1 # Replace 0 with 0.1# Add new weather station
new_station <- data.frame(
station_id = "F06",
temperature = 19.5,
humidity = 63,
precipitation = 15.2,
wind_speed = 12.8,
is_coastal = FALSE,
temp_category = "Cool",
comfort_index = (19.5 * 0.7) + (63 * 0.3)
)
climate_data <- rbind(climate_data, new_station)# Create data with missing values (common in real research)
field_data <- data.frame(
plot_id = 1:6,
soil_ph = c(6.2, 6.8, NA, 7.1, 6.5, NA),
nitrogen = c(45, 52, 38, NA, 48, 41),
plant_count = c(23, 31, 28, 25, NA, 29)
)
field_data
plot_id soil_ph nitrogen plant_count
1 1 6.2 45 23
2 2 6.8 52 31
3 3 NA 38 28
4 4 7.1 NA 25
5 5 6.5 48 NA
6 6 NA 41 29
# Check for missing values
is.na(field_data)
plot_id soil_ph nitrogen plant_count
[1,] FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE
[3,] FALSE TRUE FALSE FALSE
[4,] FALSE FALSE TRUE FALSE
[5,] FALSE FALSE FALSE TRUE
[6,] FALSE TRUE FALSE FALSE
# Count missing values per column
colSums(is.na(field_data))
plot_id soil_ph nitrogen plant_count
0 2 1 1
# Find complete cases (rows with no missing values)
complete.cases(field_data)
[1] TRUE TRUE FALSE FALSE FALSE FALSE
complete_data <- field_data[complete.cases(field_data), ]
complete_data
plot_id soil_ph nitrogen plant_count
1 1 6.2 45 23
2 2 6.8 52 31
# Remove rows with any missing values
clean_data <- na.omit(field_data)
# Calculate means ignoring missing values
mean(field_data$soil_ph, na.rm = TRUE)
[1] 6.65
# Replace missing values with mean
field_data$soil_ph[is.na(field_data$soil_ph)] <- mean(field_data$soil_ph, na.rm = TRUE)# Sort by temperature (ascending)
climate_sorted <- climate_data[order(climate_data$temperature), ]
climate_sorted
# Sort by temperature (descending)
climate_desc <- climate_data[order(-climate_data$temperature), ]
climate_desc
# Sort by precipitation (ascending)
climate_data[order(climate_data$precipitation), ]# Sort by species, then by treatment
plant_sorted <- plant_experiment[order(plant_experiment$species, plant_experiment$treatment), ]
plant_sorted# Summary of all variables
summary(climate_data)
# Apply functions to numeric columns only
numeric_cols <- sapply(climate_data, is.numeric)
sapply(climate_data[numeric_cols], mean, na.rm = TRUE)
sapply(climate_data[numeric_cols], sd, na.rm = TRUE)
# Quick data exploration
head(climate_data) # First 6 rows
tail(climate_data) # Last 6 rows
head(climate_data, 3) # First 3 rows# Calculate mean growth by species and treatment
aggregate(final_height_cm ~ species + treatment, data = plant_experiment, mean)
species treatment final_height_cm
1 Arabidopsis Control 8.150
2 Tomato Control 44.500
3 Arabidopsis Fertilizer 12.100
4 Tomato Fertilizer 51.300
# Count observations by group
table(plant_experiment$species, plant_experiment$treatment)
Control Fertilizer
Arabidopsis 2 2
Tomato 2 2
# Bird observation data
bird_data <- data.frame(
observation_id = 1:12,
species = c("Robin", "Sparrow", "Robin", "Cardinal", "Sparrow", "Robin",
"Cardinal", "Robin", "Sparrow", "Cardinal", "Robin", "Sparrow"),
habitat = rep(c("Forest", "Grassland", "Urban"), each = 4),
count = c(3, 7, 2, 1, 12, 4, 2, 5, 15, 3, 1, 8),
date = as.Date(c("2024-05-01", "2024-05-01", "2024-05-02", "2024-05-02",
"2024-05-01", "2024-05-01", "2024-05-02", "2024-05-02",
"2024-05-01", "2024-05-01", "2024-05-02", "2024-05-02")),
temperature_c = c(18, 18, 20, 20, 22, 22, 24, 24, 16, 16, 18, 18)
)
# Analysis: Which habitat has the highest total bird count? (summed counts measure abundance, not species diversity)
aggregate(count ~ habitat, data = bird_data, sum)
habitat count
1 Forest 13
2 Grassland 23
3 Urban 27
# Clinical trial data
patient_data <- data.frame(
patient_id = paste0("P", 1001:1010),
age = c(45, 67, 34, 56, 78, 43, 62, 39, 51, 48),
sex = c("M", "F", "M", "F", "M", "F", "M", "F", "M", "F"),
treatment_group = rep(c("Placebo", "Drug_A"), each = 5),
baseline_bp = c(145, 160, 138, 152, 168, 142, 155, 140, 149, 158),
followup_bp = c(143, 145, 136, 140, 151, 141, 142, 138, 138, 145),
side_effects = c(0, 1, 0, 2, 1, 0, 3, 0, 1, 2)
)
# Calculate blood pressure reduction
patient_data$bp_reduction <- patient_data$baseline_bp - patient_data$followup_bp
# Compare treatment effectiveness
aggregate(bp_reduction ~ treatment_group, data = patient_data, mean)
treatment_group bp_reduction
1 Drug_A 8.0
2 Placebo 9.6
Key takeaways:
- Use $ notation for easy column access (df$column_name)
- Use [row, column] notation for flexible subsetting
- Filter rows with logical conditions (df[df$variable > value, ])
- Handle missing values with na.rm = TRUE in calculations, complete.cases() for filtering
- Use str() and summary() to understand new data
Data frames are the foundation of statistical analysis in R and are essential for organizing and analyzing scientific data effectively.