A factor is R's way of representing categorical data - variables that represent categories or groups rather than continuous measurements. In scientific research, factors are essential for representing experimental treatments, species classifications, measurement scales, or any variable with a fixed set of possible values.
Key characteristics:
Why factors matter in science:
# Experimental treatments
treatment <- factor(c("Control", "Low_dose", "High_dose", "Control", "Low_dose", "High_dose"))
treatment
[1] Control Low_dose High_dose Control Low_dose High_dose
Levels: Control High_dose Low_dose
# Species identification
species <- factor(c("Quercus_alba", "Pinus_strobus", "Acer_rubrum", "Quercus_alba", "Pinus_strobus"))
species
[1] Quercus_alba Pinus_strobus Acer_rubrum Quercus_alba Pinus_strobus
Levels: Acer_rubrum Pinus_strobus Quercus_alba
# Soil types in ecological study
soil_type <- factor(c("Clay", "Sand", "Loam", "Clay", "Sand", "Clay", "Loam"))
soil_type
[1] Clay Sand Loam Clay Sand Clay Loam
Levels: Clay Loam Sand
# Specify all possible levels (even if not all are present in data)
treatment_levels <- factor(c("Control", "Low_dose", "High_dose"),
levels = c("Control", "Low_dose", "Medium_dose", "High_dose"))
treatment_levels
[1] Control Low_dose High_dose
Levels: Control Low_dose Medium_dose High_dose
# pH categories with logical order
ph_category <- factor(c("Acidic", "Neutral", "Basic", "Acidic", "Basic"),
levels = c("Acidic", "Neutral", "Basic"))
ph_category
[1] Acidic Neutral Basic Acidic Basic
Levels: Acidic Neutral Basic
# Severity scale (has natural order)
disease_severity <- factor(c("Mild", "Severe", "Moderate", "Mild", "Severe"),
levels = c("Mild", "Moderate", "Severe"),
ordered = TRUE)
disease_severity
[1] Mild Severe Moderate Mild Severe
Levels: Mild < Moderate < Severe
# Developmental stages
plant_stage <- factor(c("Seedling", "Juvenile", "Adult", "Seedling", "Adult"),
levels = c("Seedling", "Juvenile", "Adult"),
ordered = TRUE)
plant_stage
[1] Seedling Juvenile Adult Seedling Adult
Levels: Seedling < Juvenile < Adult
# Likert scale responses
agreement <- factor(c("Strongly Disagree", "Agree", "Neutral", "Strongly Agree"),
levels = c("Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"),
ordered = TRUE)
agreement
[1] Strongly Disagree Agree Neutral Strongly Agree
Levels: Strongly Disagree < Disagree < Neutral < Agree < Strongly Agree
treatment <- factor(c("Control", "Treated", "Control", "Treated", "Control"))
# Check if it's a factor
is.factor(treatment)
[1] TRUE
# Get the levels
levels(treatment)
[1] "Control" "Treated"
# Number of levels
nlevels(treatment)
[1] 2
# See the internal structure
str(treatment)
Factor w/ 2 levels "Control","Treated": 1 2 1 2 1
# Convert to numeric to see the underlying integer codes
as.numeric(treatment)
[1] 1 2 1 2 1
# Table of frequencies
table(treatment)
treatment
Control Treated
3 2
# Create a research dataset
research_data <- data.frame(
subject_id = 1:12,
species = factor(rep(c("Mouse", "Rat", "Rabbit"), each = 4)),
treatment = factor(rep(c("Control", "Drug_A", "Drug_B", "Placebo"), 3)),
sex = factor(rep(c("Male", "Female"), 6)),
response = c(23, 45, 67, 12, 34, 56, 78, 21, 43, 65, 87, 32)
)
str(research_data)
'data.frame': 12 obs. of 5 variables:
$ subject_id: int 1 2 3 4 5 6 7 8 9 10 ...
$ species : Factor w/ 3 levels "Mouse","Rabbit",..: 1 1 1 1 3 3 3 3 2 2 ...
$ treatment : Factor w/ 4 levels "Control","Drug_A",..: 1 2 3 4 1 2 3 4 1 2 ...
$ sex : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 1 2 1 2 1 ...
$ response : num 23 45 67 12 34 56 78 21 43 65 ...
# Summary shows factor level counts
summary(research_data)
   subject_id      species    treatment     sex       response
 Min.   : 1.00   Mouse :4   Control:3   Female:6   Min.   :12.00
 1st Qu.: 3.75   Rabbit:4   Drug_A :3   Male  :6   1st Qu.:29.75
 Median : 6.50   Rat   :4   Drug_B :3              Median :44.00
 Mean   : 6.50              Placebo:3              Mean   :46.92
 3rd Qu.: 9.25                                     3rd Qu.:65.50
 Max.   :12.00                                     Max.   :87.00
# Original factor
habitat <- factor(c("F", "G", "W", "F", "G", "W"))
habitat
[1] F G W F G W
Levels: F G W
# Rename levels to be more descriptive
levels(habitat) <- c("Forest", "Grassland", "Wetland")
habitat
[1] Forest Grassland Wetland Forest Grassland Wetland
Levels: Forest Grassland Wetland
# Alternative method using labels during creation
habitat2 <- factor(c("F", "G", "W", "F", "G", "W"),
levels = c("F", "G", "W"),
labels = c("Forest", "Grassland", "Wetland"))
habitat2
[1] Forest Grassland Wetland Forest Grassland Wetland
Levels: Forest Grassland Wetland
# Default alphabetical order
size <- factor(c("Large", "Small", "Medium", "Large", "Small"))
size
[1] Large Small Medium Large Small
Levels: Large Medium Small
# Reorder to logical size order
size_ordered <- factor(size, levels = c("Small", "Medium", "Large"))
size_ordered
[1] Large Small Medium Large Small
Levels: Small Medium Large
# Or make it an ordered factor
size_ordinal <- factor(size, levels = c("Small", "Medium", "Large"), ordered = TRUE)
size_ordinal
[1] Large Small Medium Large Small
Levels: Small < Medium < Large
# Original factor
season <- factor(c("Spring", "Summer", "Fall", "Spring", "Summer"))
levels(season)
[1] "Fall" "Spring" "Summer"
# Add new level
levels(season) <- c(levels(season), "Winter")
levels(season)
[1] "Fall" "Spring" "Summer" "Winter"
# Drop unused levels
season_used <- season[1:3] # Only first 3 elements
season_used
[1] Spring Summer Fall
Levels: Fall Spring Summer Winter
droplevels(season_used) # Remove unused "Winter" level
[1] Spring Summer Fall
Levels: Fall Spring Summer
# Create experimental data
experiment <- data.frame(
genotype = factor(rep(c("Wild_type", "Mutant_A", "Mutant_B"), each = 20)),
condition = factor(rep(rep(c("Control", "Stress"), each = 10), 3)),
survival = factor(c(rep(c("Alive", "Dead"), c(9, 1)), # Wild_type Control
rep(c("Alive", "Dead"), c(7, 3)), # Wild_type Stress
rep(c("Alive", "Dead"), c(8, 2)), # Mutant_A Control
rep(c("Alive", "Dead"), c(4, 6)), # Mutant_A Stress
rep(c("Alive", "Dead"), c(6, 4)), # Mutant_B Control
rep(c("Alive", "Dead"), c(3, 7)))) # Mutant_B Stress
)
# Cross-tabulation
table(experiment$genotype, experiment$condition)
Control Stress
Mutant_A 10 10
Mutant_B 10 10
Wild_type 10 10
# Three-way table
table(experiment$genotype, experiment$condition, experiment$survival)
, , survival = Alive
condition
genotype Control Stress
Mutant_A 8 4
Mutant_B 6 3
Wild_type 9 7
, , survival = Dead
condition
genotype Control Stress
Mutant_A 2 6
Mutant_B 4 7
Wild_type 1 3
# ANOVA example with factors
plant_growth <- data.frame(
treatment = factor(rep(c("Control", "Fertilizer_A", "Fertilizer_B"), each = 8)),
growth_rate = c(12, 14, 13, 15, 11, 16, 12, 14, # Control
18, 20, 19, 22, 17, 21, 18, 20, # Fertilizer_A
25, 27, 24, 28, 23, 29, 25, 26) # Fertilizer_B
)
# Boxplot by treatment group
boxplot(growth_rate ~ treatment, data = plant_growth,
main = "Plant Growth by Treatment",
xlab = "Treatment", ylab = "Growth Rate (cm/week)")
# ANOVA
growth_anova <- aov(growth_rate ~ treatment, data = plant_growth)
summary(growth_anova)
Df Sum Sq Mean Sq F value Pr(>F)
treatment 2 625.3 312.7 95.68 2.81e-11 ***
Residuals 21 68.6 3.3
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc comparisons
TukeyHSD(growth_anova)
# Character variable
char_treatment <- c("Control", "Treated", "Control", "Treated")
class(char_treatment)
[1] "character"
# Factor variable
factor_treatment <- factor(c("Control", "Treated", "Control", "Treated"))
class(factor_treatment)
[1] "factor"
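# Statistical functions treat the two differently
# With a character vector, modelling functions convert it to a factor implicitly
# (levels in alphabetical order), so you cannot control level order or the reference level:
# aov(response ~ char_treatment, data = mydata) # Implicit conversion to factor
# With an explicit factor, you control the levels and their order:
# aov(response ~ factor_treatment, data = mydata) # Proper statistical analysis
# Character to factor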
species_char <- c("Oak", "Pine", "Maple", "Oak", "Pine")
species_factor <- factor(species_char)
species_factor
[1] Oak Pine Maple Oak Pine
Levels: Maple Oak Pine
# Factor to character
as.character(species_factor)
[1] "Oak" "Pine" "Maple" "Oak" "Pine"
# Factor to numeric (gets internal codes)
as.numeric(species_factor)
[1] 2 3 1 2 3
# Proper way to convert factor of numbers to numeric
numeric_factor <- factor(c("10", "20", "15", "10", "25"))
as.numeric(numeric_factor) # Wrong! Gets 1, 3, 2, 1, 4 (internal codes; levels sort as "10" "15" "20" "25")
[1] 1 3 2 1 4
as.numeric(as.character(numeric_factor)) # Correct!
[1] 10 20 15 10 25
# Two separate factors
site1_species <- factor(c("Oak", "Pine", "Maple"))
site2_species <- factor(c("Oak", "Birch", "Pine"))
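# Combine factors with c() (result depends on the R version)
combined_simple <- c(site1_species, site2_species)
combined_simple # R >= 4.1.0: a factor; levels are the union, in order of first appearance
[1] Oak Pine Maple Oak Birch Pine
Levels: Maple Oak Pine Birch
# In R < 4.1.0, c() returned only the internal integer codes: 2 3 1 2 1 3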
# Proper way to combine factors
combined_factor <- factor(c(as.character(site1_species), as.character(site2_species)))
combined_factor
[1] Oak Pine Maple Oak Birch Pine
Levels: Birch Maple Oak Pine
# Original factor
original_treatment <- factor(c("T1", "T2", "T3", "T1", "T2", "T3"))
# Recode to meaningful names
recoded_treatment <- factor(original_treatment,
levels = c("T1", "T2", "T3"),
labels = c("Control", "Low_Nutrient", "High_Nutrient"))
recoded_treatment
[1] Control Low_Nutrient High_Nutrient Control Low_Nutrient High_Nutrient
Levels: Control Low_Nutrient High_Nutrient
# Conditional recoding
ph_values <- factor(c("Low", "Medium", "High", "Low", "Medium", "High"))
ph_binary <- ifelse(ph_values == "High", "Basic", "Not_Basic")
ph_binary <- factor(ph_binary)
ph_binary
[1] Not_Basic Not_Basic Basic Not_Basic Not_Basic Basic
Levels: Basic Not_Basic
# Bird survey data
bird_survey <- data.frame(
site_id = 1:24,
habitat = factor(rep(c("Forest", "Grassland", "Wetland", "Urban"), each = 6)),
season = factor(rep(c("Spring", "Summer", "Fall"), 8)),
species = factor(c("Robin", "Sparrow", "Duck", "Pigeon", "Hawk", "Owl",
"Robin", "Sparrow", "Duck", "Pigeon", "Hawk", "Owl",
"Robin", "Sparrow", "Duck", "Pigeon", "Hawk", "Owl",
"Robin", "Sparrow", "Duck", "Pigeon", "Hawk", "Owl")),
abundance = c(12, 8, 5, 15, 2, 3, 18, 12, 8, 20, 4, 5,
6, 4, 15, 8, 1, 7, 25, 18, 3, 35, 8, 12)
)
# Analysis by habitat type
aggregate(abundance ~ habitat, data = bird_survey, mean)
habitat abundance
1 Forest 7.500000
2 Grassland 11.166667
3 Urban 16.833333
4 Wetland 6.833333
# Two-way analysis
aggregate(abundance ~ habitat + season, data = bird_survey, sum)
habitat season abundance
1 Forest Fall 8
2 Grassland Fall 13
3 Urban Fall 15
4 Wetland Fall 22
5 Forest Spring 27
6 Grassland Spring 38
7 Urban Spring 60
8 Wetland Spring 14
9 Forest Summer 10
10 Grassland Summer 16
11 Urban Summer 26
12 Wetland Summer 5
# Medical research data
clinical_data <- data.frame(
patient_id = 1:60,
treatment = factor(rep(c("Placebo", "Drug_Low", "Drug_High"), each = 20)),
age_group = factor(rep(c("Young", "Middle", "Old"), 20),
levels = c("Young", "Middle", "Old"), ordered = TRUE),
response = factor(c(
rep(c("None", "Mild", "Strong"), c(12, 6, 2)), # Placebo
rep(c("None", "Mild", "Strong"), c(8, 8, 4)), # Drug_Low
rep(c("None", "Mild", "Strong"), c(3, 7, 10)) # Drug_High
), levels = c("None", "Mild", "Strong"), ordered = TRUE)
)
# Response by treatment
response_table <- table(clinical_data$treatment, clinical_data$response)
response_table
None Mild Strong
Drug_High 3 7 10
Drug_Low 8 8 4
Placebo 12 6 2
# Proportion table
prop.table(response_table, margin = 1) # Proportions by treatment
None Mild Strong
Drug_High 0.15 0.35 0.50
Drug_Low 0.40 0.40 0.20
Placebo 0.60 0.30 0.10
# Crop yield study
crop_study <- data.frame(
plot = 1:48,
variety = factor(rep(c("Variety_A", "Variety_B", "Variety_C", "Variety_D"), each = 12)),
irrigation = factor(rep(c("Low", "Medium", "High"), 16)),
fertilizer = factor(rep(rep(c("None", "Organic", "Synthetic", "Both"), each = 3), 4)),
yield_kg = runif(48, 800, 1200) # Simulated yield data
)
# Multi-factor ANOVA
yield_model <- aov(yield_kg ~ variety + irrigation + fertilizer, data = crop_study)
summary(yield_model)
# Interaction effects
interaction_model <- aov(yield_kg ~ variety * irrigation, data = crop_study)
summary(interaction_model)
# Reading data - strings become factors by default in R versions before 4.0.0
# Solution: Use stringsAsFactors = FALSE or read with readr package
data <- read.csv("file.csv", stringsAsFactors = FALSE)
# Or convert after reading
data$treatment <- factor(data$treatment)
# Default alphabetical order may not be logical
size <- factor(c("Large", "Small", "Medium"))
levels(size)
[1] "Large" "Medium" "Small" # Alphabetical, not logical
# Solution: Specify levels explicitly
size_correct <- factor(size, levels = c("Small", "Medium", "Large"))
levels(size_correct)
[1] "Small" "Medium" "Large" # Logical order
# Wrong way
numeric_factor <- factor(c("10", "20", "30"))
as.numeric(numeric_factor) # Gets 1, 2, 3 (internal codes)
[1] 1 2 3
# Right way
as.numeric(as.character(numeric_factor)) # Gets 10, 20, 30
[1] 10 20 30
# After subsetting, unused levels remain
full_data <- factor(c("A", "B", "C", "D", "E"))
subset_data <- full_data[1:3]
levels(subset_data)
[1] "A" "B" "C" "D" "E" # D and E still there
# Solution: Drop unused levels
clean_data <- droplevels(subset_data)
levels(clean_data)
[1] "A" "B" "C" # Only used levels remain
- Use ordered = TRUE for scales and rankings
- Use table() for frequency analysis

Factors are fundamental for statistical analysis in R and essential for properly handling categorical variables in scientific research. They ensure that statistical functions interpret your categorical data correctly and enable proper group comparisons and modeling.