Container environments change how file paths work, and mixing I/O with computation makes code harder to test and reuse. Students should write path-agnostic code and separate data loading/saving from analysis logic.
The fundamental issue
import pandas as pd

# ❌ Breaks in containers - assumes a specific host layout
df = pd.read_csv("/Users/student/project/data/survey.csv")

# ❌ Fragile - depends on the current working directory
df = pd.read_csv("data/survey.csv")  # Where is "data" exactly?

# ✅ Robust - resolved relative to the code's location
from pathlib import Path

script_dir = Path(__file__).parent
df = pd.read_csv(script_dir / "data" / "survey.csv")
Project root helper pattern
from pathlib import Path

import pandas as pd


def get_project_root() -> Path:
    """Get the project root directory (where Dockerfile/Makefile live)."""
    return Path(__file__).resolve().parents[1]  # Adjust the index for your layout


# Usage throughout your project
PROJECT_ROOT = get_project_root()
DATA_DIR = PROJECT_ROOT / "data"
OUTPUT_DIR = PROJECT_ROOT / "outputs"


def load_survey_data() -> pd.DataFrame:
    """Load survey data from the standard location."""
    return pd.read_csv(DATA_DIR / "raw" / "survey.csv")
Environment-aware paths
from pathlib import Path

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class PathSettings(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="APP_")  # APP_DATA_DIR, APP_OUTPUT_DIR

    data_dir: Path = Field(default=Path("./data"))
    output_dir: Path = Field(default=Path("./outputs"))


settings = PathSettings()
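With the defaults above, settings.data_dir is Path("data"); inside a container, the prefixed environment variables override it. A usage sketch (the variable values are illustrative):

# Override via environment variables, e.g. in docker run:
#   docker run -e APP_DATA_DIR=/data -e APP_OUTPUT_DIR=/outputs ...
settings = PathSettings()
df = pd.read_csv(settings.data_dir / "survey.csv")  # /data/survey.csv inside the container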
Key principles for container paths (a minimal sketch follows the list):

- Set a WORKDIR in your Dockerfile so relative paths have a stable anchor
- Use -w /app in docker run commands to match it
- Mount input data read-only (:ro) in production
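A minimal sketch of these principles; the image name and host paths are placeholders:

# Dockerfile - a fixed working directory gives relative paths a stable anchor
FROM python:3.12-slim
WORKDIR /app
COPY . /app

# Matching run command, with production data mounted read-only
docker run --rm -w /app \
    -v /prod/data:/data:ro \
    my-analysis-image \
    python -m your_package.main --data-dir /data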
❌ Bad: I/O mixed with computation

def analyze_survey_data():
    """Analyze survey data and save results."""
    # I/O mixed with computation - hard to test
    df = pd.read_csv("data/survey.csv")

    # Analysis logic
    avg_score = df['satisfaction'].mean()
    group_stats = df.groupby('department')['satisfaction'].agg(['mean', 'std'])

    # More I/O mixed in
    results = {
        'overall_average': avg_score,
        'by_department': group_stats.to_dict(orient='index')
    }
    with open("outputs/results.json", "w") as f:
        json.dump(results, f)

    print(f"Analysis complete. Average satisfaction: {avg_score}")
✅ Good: I/O separated from computation
import json
from pathlib import Path
from typing import Any

import pandas as pd


def calculate_satisfaction_stats(df: pd.DataFrame) -> dict[str, Any]:
    """Calculate satisfaction statistics from survey data.

    Pure function - no I/O, easy to test.
    """
    avg_score = df['satisfaction'].mean()
    group_stats = df.groupby('department')['satisfaction'].agg(['mean', 'std'])
    return {
        'overall_average': avg_score,
        # orient='index' keys the dict by department rather than by statistic
        'by_department': group_stats.to_dict(orient='index')
    }


def load_survey_data(data_path: Path) -> pd.DataFrame:
    """Load and validate survey data."""
    df = pd.read_csv(data_path)

    # Basic validation
    required_cols = ['satisfaction', 'department']
    missing_cols = set(required_cols) - set(df.columns)
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    return df


def save_results(results: dict[str, Any], output_path: Path) -> None:
    """Save analysis results to a JSON file."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)


# Main script coordinates I/O and computation
def main():
    """Main analysis pipeline."""
    # I/O layer
    data_path = DATA_DIR / "raw" / "survey.csv"
    output_path = OUTPUT_DIR / "satisfaction_analysis.json"
    df = load_survey_data(data_path)

    # Pure computation
    results = calculate_satisfaction_stats(df)

    # I/O layer
    save_results(results, output_path)

    # User feedback
    avg_score = results['overall_average']
    print(f"Analysis complete. Average satisfaction: {avg_score:.2f}")


if __name__ == "__main__":
    main()
Testing the separated functions
import pandas as pd
import pytest

from your_package.analysis import calculate_satisfaction_stats


def test_satisfaction_stats():
    # Arrange: create test data (no files needed!)
    test_df = pd.DataFrame({
        'satisfaction': [4, 5, 3, 4, 5],
        'department': ['Engineering', 'Engineering', 'Sales', 'Sales', 'Marketing']
    })

    # Act: call the pure function
    results = calculate_satisfaction_stats(test_df)

    # Assert: check results
    assert results['overall_average'] == pytest.approx(4.2)
    assert 'Engineering' in results['by_department']
    assert results['by_department']['Engineering']['mean'] == pytest.approx(4.5)
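The I/O layer is just as easy to test in isolation; a sketch using pytest's built-in tmp_path fixture (the test name is ours, save_results is the function defined above):

import json

from your_package.analysis import save_results


def test_save_results_creates_missing_dirs(tmp_path):
    output_path = tmp_path / "outputs" / "results.json"

    # save_results creates the missing parent directory itself
    save_results({'overall_average': 4.2}, output_path)

    with open(output_path) as f:
        assert json.load(f) == {'overall_average': 4.2}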
Configurable paths via command line
import argparse
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(description="Survey data analysis")
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=Path("./data"),
        help="Directory containing input data"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("./outputs"),
        help="Directory for output files"
    )
    parser.add_argument(
        "--survey-file",
        type=str,
        default="survey.csv",
        help="Survey data filename"
    )
    args = parser.parse_args()

    # Build paths from arguments
    data_path = args.data_dir / args.survey_file
    output_path = args.output_dir / "analysis_results.json"

    # Run the analysis with the specified paths
    run_analysis(data_path, output_path)


# Usage in different environments:
# Local:     python -m analysis --data-dir ./local_data
# Container: python -m analysis --data-dir /mounted/data
Using with Make targets
# Development with local data
run-local:
	uv run python -m your_package.main --data-dir ./data/dev

# Container with mounted production data
run-prod:
	docker run --rm -it \
		-v /prod/data:/data:ro \
		-v $(PWD)/outputs:/outputs \
		$(IMAGE_NAME) \
		uv run python -m your_package.main --data-dir /data --output-dir /outputs
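Both targets are then invoked the same way on any machine; since IMAGE_NAME is an ordinary Make variable, it can also be set on the command line (the image name here is a placeholder):

make run-local
make run-prod IMAGE_NAME=my-analysis-image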
1. Configuration-driven paths
# ✅ Good: paths come from configuration, with validation
from pathlib import Path

from pydantic import BaseModel, Field, field_validator


class AnalysisConfig(BaseModel):
    """Configuration for the analysis pipeline with path validation."""

    data_dir: Path = Field(..., description="Directory containing input data")
    output_dir: Path = Field(default=Path("./outputs"))
    survey_filename: str = Field(default="survey.csv", min_length=1)

    @field_validator('data_dir')
    @classmethod
    def data_dir_must_exist(cls, v: Path) -> Path:
        if not v.exists():
            raise ValueError(f"Data directory does not exist: {v}")
        return v

    @field_validator('output_dir')
    @classmethod
    def create_output_dir(cls, v: Path) -> Path:
        v.mkdir(parents=True, exist_ok=True)
        return v

    @property
    def survey_path(self) -> Path:
        return self.data_dir / self.survey_filename

    @property
    def results_path(self) -> Path:
        return self.output_dir / "results.json"


def run_analysis(config: AnalysisConfig):
    df = load_survey_data(config.survey_path)
    results = calculate_satisfaction_stats(df)
    save_results(results, config.results_path)
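Because validation runs at construction, a bad path fails at startup rather than mid-pipeline. A usage sketch (the directory name is a placeholder):

from pydantic import ValidationError

try:
    config = AnalysisConfig(data_dir=Path("/mounted/data"))
except ValidationError as e:
    print(f"Bad configuration: {e}")  # e.g., the data directory does not exist
else:
    run_analysis(config)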
2. Path validation and error handling
def load_data_safely(data_path: Path) -> pd.DataFrame:
    """Load data with clear error messages."""
    if not data_path.exists():
        raise FileNotFoundError(
            f"Data file not found: {data_path}\n"
            f"Expected location: {data_path.absolute()}\n"
            f"Working directory: {Path.cwd()}"
        )
    if not data_path.is_file():
        raise ValueError(f"Path exists but is not a file: {data_path}")
    return pd.read_csv(data_path)
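When the file is missing, the error now reports everything needed to debug a bad container mount. A sketch with an illustrative path:

try:
    df = load_data_safely(Path("/data/survey.csv"))
except FileNotFoundError as e:
    print(e)  # shows the path, its absolute form, and the current working directory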
3. Output directory management
def ensure_output_dir(output_path: Path) -> Path:
    """Create the output directory and return the path."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    return output_path


def save_with_backup(data: Any, output_path: Path) -> None:
    """Save data, backing up any existing file first."""
    if output_path.exists():
        backup_path = output_path.with_suffix(f".backup{output_path.suffix}")
        output_path.rename(backup_path)
        print(f"Backed up existing file to {backup_path}")

    # Save the new file
    ensure_output_dir(output_path)
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2)
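Running the pipeline twice then keeps the previous results alongside the new ones; a usage sketch (the path is illustrative):

results_path = Path("outputs") / "results.json"
save_with_backup({'run': 1}, results_path)  # writes outputs/results.json
save_with_backup({'run': 2}, results_path)  # moves the old file to results.backup.json, then writes anew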
✅ Good practices to encourage:

- Building every path with pathlib.Path rather than string concatenation

❌ Issues to flag:

- Missing if __name__ == "__main__": guards

Review questions to ask:

- Where does this path resolve when the code runs inside a container?
- Can this function be tested without creating any files?
Progression from problematic to robust:
# Step 1: Show them what breaks
df = pd.read_csv("/Users/alice/project/data.csv")  # Won't work for Bob

# Step 2: Build the path relative to the code's location
from pathlib import Path

data_path = Path(__file__).parent / "data" / "survey.csv"
df = pd.read_csv(data_path)

# Step 3: Separate I/O from computation
def load_data(path: Path) -> pd.DataFrame:
    return pd.read_csv(path)

def analyze_data(df: pd.DataFrame) -> dict:
    return {"mean": df["score"].mean()}

def main():
    df = load_data(data_path)
    results = analyze_data(df)

# Step 4: Make the path configurable
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=Path, required=True)
    args = parser.parse_args()
    df = load_data(args.data)
    results = analyze_data(df)
Key teaching points:

- Hardcoded absolute paths break as soon as code runs on another machine or inside a container
- pathlib.Path plus a project-root helper makes code path-agnostic
- Separating data loading/saving from analysis logic yields pure functions that are easy to test and reuse
- Configurable paths (CLI arguments, environment variables, validated config objects) let the same code run locally and in containers