Remember: We're preparing students for real data science careers, not just academic assignments.
Guide students to solutions rather than fixing code yourself
Fair standards across all projects and teams
Mirror real data science team practices
Raise systemic issues to mentors promptly
Key Insight: Your job is to develop student capabilities, not deliver perfect code.
Code should work the same way for everyone, everywhere
Students internalize industry-level practices
Catch "works on my machine" issues early
Enable consistent, actionable feedback
For a full list of technical standards and best practices: https://dsi-clinic.github.io/ta-training/technical/
import pandas as pd
import numpy as np

def proc(d):
    return d.groupby('cat').mean()

def ld(f):
    return pd.read_csv(f)

def sv(d, f):
    d.to_csv(f)

def main():
    d = ld('data.csv')
    r = proc(d)
    sv(r, 'out.csv')

"""Survey data analysis module."""
from pathlib import Path
import pandas as pd
import logging

logger = logging.getLogger(__name__)

def load_survey_data(file_path: Path) -> pd.DataFrame:
    """Load survey data with validation."""
    logger.info(f"Loading survey data from {file_path}")
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    return pd.read_csv(file_path)

def calculate_category_averages(
    df: pd.DataFrame,
    category_col: str = 'category'
) -> pd.DataFrame:
    """Calculate mean values grouped by category."""
    return df.groupby(category_col).mean()

def save_results(df: pd.DataFrame, output_path: Path) -> None:
    """Save analysis results to CSV."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    logger.info(f"Results saved to {output_path}")

Key Teaching Points: Good structure isn't about following rules - it's about making code maintainable and collaborative. Help students see how clear names and documentation save time for their future selves and teammates.
def clean_survey_data(df):
    # Print debugging - can't be controlled
    print("Starting with", len(df), "rows")
    # Remove invalid responses
    df = df[df['score'] > 0]
    print("After score filter:", len(df))
    # Remove outliers
    df = df[df['score'] < 100]
    print("After outlier removal:", len(df))
    return df

# No tests - hope it works!

# Manual testing in notebooks
df = pd.read_csv('data.csv')
result = clean_survey_data(df)
# Visual inspection only
result.head()

import logging
logger = logging.getLogger(__name__)

def clean_survey_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean survey data with logging and validation."""
    logger.info(f"Starting data cleaning with {len(df)} rows")
    # Validate input
    required_cols = ['score', 'user_id']
    missing = set(required_cols) - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    # Remove invalid responses
    initial_count = len(df)
    df_clean = df[df['score'] > 0]
    removed = initial_count - len(df_clean)
    logger.info(f"Removed {removed} rows with invalid scores")
    # Remove outliers (configurable threshold)
    outlier_threshold = 100
    df_final = df_clean[df_clean['score'] < outlier_threshold]
    outliers = len(df_clean) - len(df_final)
    logger.info(f"Removed {outliers} outlier rows")
    logger.info(f"Cleaning complete: {len(df_final)} rows remaining")
    return df_final

import pandas as pd
# ARRANGE: Set up test data
test_data = pd.DataFrame({
    'user_id': [1, 2, 3, 4],
    'score': [85, -5, 95, 0]
})

# ACT: Call the function
result = clean_survey_data(test_data)

# ASSERT: Verify the expected behavior
assert len(result) == 2, f"Expected 2 rows, got {len(result)}"
assert all(result['score'] > 0), "All scores should be positive"
print("✓ Test passed: Invalid scores removed correctly")

# Test error handling
test_data_bad = pd.DataFrame({'other_col': [1, 2, 3]})
try:
    clean_survey_data(test_data_bad)
    assert False, "Should have raised ValueError"
except ValueError as e:
    assert "Missing columns" in str(e)
    print("✓ Test passed: Missing columns detected")

Key Teaching Points: Systematic debugging saves time and builds professional habits. Look for print statements instead of logging and functions without error handling. The Arrange-Act-Assert pattern structures tests clearly - simple assertions are usually sufficient. While pytest is the professional standard, basic assertions help students start testing without additional tools.
# Dockerfile
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
WORKDIR /app
# Copy dependency files
COPY pyproject.toml uv.lock ./
# Install dependencies using uv
RUN uv sync --frozen --no-dev
# Copy source code
COPY . .
# Default command
CMD ["uv", "run", "python", "app.py"]# Makefile
IMAGE=project-name

.PHONY: build run test lint sync

build:
	docker build -t $(IMAGE) .

run:
	docker run --rm -it \
		-v $(PWD):/app -w /app \
		$(IMAGE) uv run python app.py

test:
	docker run --rm -it \
		-v $(PWD):/app -w /app \
		$(IMAGE) uv run pytest -q

lint:
	docker run --rm -it \
		-v $(PWD):/app -w /app \
		$(IMAGE) uv run ruff check .

sync:
	docker run --rm -it \
		-v $(PWD):/app -w /app \
		$(IMAGE) uv sync

Key Teaching Points: Modern Python packaging with uv provides faster, more reliable dependency management. Check that projects have pyproject.toml, uv.lock is committed, and make commands use `uv run`. This workflow eliminates "works on my machine" problems.
# 1. Create feature branch
git checkout -b jd/my-feature
# 2. Work and commit regularly
git add .
git commit -m "Add data validation"
# 3. Keep current with main
git checkout main && git pull
git checkout jd/my-feature && git rebase main
# 4. Open PR when ready
# 5. Address feedback
# 6. TA merges after approval

Key Teaching Points: Feature branches encourage experimentation without fear of breaking main. Squash merging keeps history clean and makes it easy to roll back entire features. Help students understand that messy commits on feature branches are fine - the final merge is what matters.
For more code review guidelines and best practices: https://dsi-clinic.github.io/ta-training/technical/code-review.html
Common red flags to call out in review: hardcoded paths like /Users/alice/project/data.csv and vague function names like process() or calc().

# Instead of: "Fix the paths"
# Try:
"I notice this hardcoded path might break on other machines:
`df = pd.read_csv('/Users/alice/project/data.csv')`
Could we use a relative path instead? Something like:
`from pathlib import Path`
`data_path = Path(__file__).parent / 'data' / 'survey.csv'`
`df = pd.read_csv(data_path)`
This way anyone can run your code."

# Good escalation message:
"I've noticed 3 teams this week struggling with Docker setup on Windows machines.
They're getting permission errors when mounting volumes. This is blocking their
ability to run `make test`.
Suggested fix: Add Windows-specific Make targets or update the handbook with
Windows Docker Desktop configuration steps.
This affects ~40% of our current cohort. Can we address this in Friday's
all-hands meeting?"def process_survey_data():
data = pd.read_csv('/Users/alice/Documents/clinic/survey_data.csv')
print("Data loaded successfully!")
# Remove invalid responses
clean_data = data[data['satisfaction_score'] > 0]
# Calculate statistics
avg = clean_data['satisfaction_score'].mean()
dept_stats = clean_data.groupby('department')['satisfaction_score'].mean()
# Save results
results = {'overall_avg': avg, 'dept_averages': dept_stats.to_dict()}
with open('analysis_results.json', 'w') as f:
json.dump(results, f)
print(f"Analysis complete! Average satisfaction: {avg}")
return resultsEvery code review, every standard enforced, every coaching conversation shapes how students will approach data science throughout their careers. The habits they learn here will serve them for years to come.
Let's discuss any questions. For reference:
Full handbook available for reference
Examples in technical/examples/ directory
Additional topics covered in appendix slides
Questions? Reach out anytime!
These slides cover essential topics for TAs working with students. Use them as reference during code reviews and student interactions.
Topics covered: AI usage guidelines, repository hygiene, type hints, Python best practices, Pydantic validation, configuration management, dependency management, I/O separation patterns, and logging & error messages.
Key Teaching Points: AI is a tool for learning, not a shortcut to avoid understanding. Help students use AI responsibly by focusing on comprehension and attribution. The goal is building skills, not just producing code.
# While PR is under review, continue building on it
git checkout jd/current-feature
git checkout -b jd/current-feature-part2
# Continue development
git add . && git commit -m "Build on current feature"
# When current PR merges, update and open new PR
git checkout main && git pull
git checkout jd/current-feature-part2
git rebase main

Key Teaching Points: Code debt accumulates quickly over a quarter. Help students develop habits of regular maintenance and prompt response to feedback. A clean repository is easier to review, understand, and maintain.
def process_data(data, config):
    # What type is data?
    # What does config contain?
    # What gets returned?
    result = analyze(data, config)
    return result

def calculate_average(scores):
    return sum(scores) / len(scores)

# Later...
calculate_average("85,90,78")  # TypeError!

from typing import Dict, List

def process_survey_data(
    data: pd.DataFrame,
    config: AnalysisConfig
) -> Dict[str, float]:
    """Process survey data and return statistics."""
    result = analyze(data, config)
    return result

def calculate_average(scores: List[float]) -> float:
    """Calculate mean of numeric scores."""
    return sum(scores) / len(scores)

# Type checker catches error before runtime
calculate_average("85,90,78")  # mypy error!

Key Teaching Points: Type hints aren't just documentation - they're early error detection. Focus on public functions and anywhere data types aren't obvious. Help students see how type hints make code self-documenting and catch bugs before runtime.
def add_item(item, items=[]):
    items.append(item)
    return items

# Dangerous! Same list shared across calls
list1 = add_item("apple")   # ["apple"]
list2 = add_item("banana")  # ["apple", "banana"]

results = []
for i in range(3):
    # Late binding closure problem
    results.append(lambda: i * 2)

# All functions return 4 (i=2)
[f() for f in results]  # [4, 4, 4]

def add_item(item, items=None):
    if items is None:
        items = []
    items.append(item)
    return items

# Each call gets fresh list
list1 = add_item("apple")   # ["apple"]
list2 = add_item("banana")  # ["banana"]

results = []
for i in range(3):
    # Capture i explicitly
    results.append(lambda x=i: x * 2)

# Each function has its own value
[f() for f in results]  # [0, 2, 4]

Key Teaching Points: These aren't style issues - they're bugs waiting to happen. Help students understand why Python behaves this way.
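To answer the "why" concretely: default argument values are evaluated once, when the def statement runs, and stored on the function object, so every call that omits the argument shares that one list. A quick demonstration students can run themselves (a sketch; it redefines the buggy version on purpose):

def add_item(item, items=[]):
    items.append(item)
    return items

# The default list is created once, at definition time
print(add_item.__defaults__)            # ([],)
add_item("apple")
print(add_item.__defaults__)            # (['apple'],) - the stored default itself changed
print(add_item("x") is add_item("y"))   # True - both calls return the same list object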
def process_config(config_dict):
    # What if keys are missing?
    # What if values are wrong type?
    db_url = config_dict["database_url"]
    max_workers = config_dict["max_workers"]
    debug = config_dict["debug_mode"]
    # Runtime errors waiting to happen
    return setup_pipeline(db_url, max_workers, debug)

from pydantic import BaseModel, Field, validator

class PipelineConfig(BaseModel):
    database_url: str = Field(..., min_length=1)
    max_workers: int = Field(default=2, ge=1, le=16)
    debug_mode: bool = Field(default=False)

    @validator('database_url')
    def validate_db_url(cls, v):
        if not v.startswith(('postgresql://', 'sqlite://')):
            raise ValueError('Invalid database URL')
        return v

# Validates automatically on creation
config = PipelineConfig(**config_dict)

Key Teaching Points: Pydantic isn't just validation - it's early error detection that saves debugging time. When students load configuration or API data, suggest Pydantic to catch problems before they become mysterious runtime failures.
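To make "early error detection" tangible during a review, show what happens when a bad dictionary hits the model: construction fails immediately with a field-by-field report instead of a mysterious failure deep in the pipeline. A sketch using the PipelineConfig model above (the bad values are illustrative):

from pydantic import ValidationError

bad_config = {
    "database_url": "mysql://prod-db",  # rejected by the URL validator
    "max_workers": 64,                  # above the le=16 constraint
}

try:
    PipelineConfig(**bad_config)
except ValidationError as exc:
    # One entry per failing field, with the reason attached
    print(exc)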
def analyze_data():
    # Hardcoded values scattered throughout
    df = pd.read_csv("/Users/alice/data/survey.csv")
    # Magic numbers with no context
    threshold = 0.95
    min_samples = 50
    # Database connection hardcoded
    conn = sqlite3.connect("project_db.sqlite")
    # API key in source code
    api_key = "sk-1234567890abcdef"

# .env file (often missing .env.example)
DATA_PATH=/Users/alice/data/survey.csv
CONFIDENCE_LEVEL=0.95
MIN_SAMPLES=50
DB_URL=sqlite:///project_db.sqlite
API_KEY=sk-1234567890abcdef

from pydantic import Field, validator
from pydantic_settings import BaseSettings
from pathlib import Path

class AnalysisConfig(BaseSettings):
    """Centralized configuration with validation."""
    # Data paths with validation
    data_path: Path = Field(..., description="Path to survey data")
    output_dir: Path = Field(default=Path("./outputs"))
    # Analysis parameters with constraints
    confidence_level: float = Field(default=0.95, ge=0.5, le=0.99)
    min_samples: int = Field(default=50, ge=1)
    # External services
    database_url: str = Field(..., description="Database connection string")
    api_key: str = Field(..., description="External API key")

    @validator('data_path')
    def validate_data_path(cls, v):
        if not v.exists():
            raise ValueError(f"Data file not found: {v}")
        return v

    class Config:
        env_file = ".env"

# Usage with automatic validation
config = AnalysisConfig()

# .env.example - commit this file
DATA_PATH=./data/survey.csv
CONFIDENCE_LEVEL=0.95
MIN_SAMPLES=50
DATABASE_URL=sqlite:///your_database.sqlite
API_KEY=your_api_key_here

Key Teaching Points: Configuration management prevents "works on my machine" problems. Look for hardcoded values, missing .env.example files, and secrets in code. Pydantic settings classes catch configuration errors early and document what's required.
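To connect the class back to the .env files: each field name maps to an environment variable (matching is case-insensitive by default), so the committed .env.example documents exactly what a new teammate must provide. A small sketch, assuming the AnalysisConfig class above and that ./data/survey.csv exists (the validator checks for it):

import os

# Values can come from the environment, a local .env file, or keyword arguments
os.environ["DATA_PATH"] = "./data/survey.csv"
os.environ["DATABASE_URL"] = "sqlite:///project_db.sqlite"
os.environ["API_KEY"] = "dev-key"  # placeholder for local development only

config = AnalysisConfig()
print(config.confidence_level)  # 0.95 - not set here, so the field default applies
print(config.min_samples)       # 50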
# Old requirements.txt approach
pandas
numpy
scikit-learn
matplotlib
requests

# Student runs on their laptop
pip install pandas numpy scikit-learn
# Works for them, breaks for teammates
# Different package versions
# Different Python versions
# Different operating systems

# README.md
## Setup
1. Install Python
2. Run the code
3. Hope it works

# pyproject.toml
[project]
name = "survey-analysis"
dependencies = [
    "pandas==2.2.2",
    "numpy==1.26.4",
    "scikit-learn==1.4.2",
]

[project.optional-dependencies]
dev = [
    "pytest==8.1.1",
    "ruff==0.4.2",
]

# Dockerfile
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
WORKDIR /app
COPY pyproject.toml uv.lock ./
RUN uv sync --frozen --no-dev
COPY . .
CMD ["uv", "run", "python", "app.py"]# README.md
## Setup
1. Install Docker Desktop
2. Clone this repository
3. Run: `make build`
4. Run: `make test`
5. Start analysis: `make run`
## Development
- `make shell` - Interactive container
- `make sync` - Update dependencies

Key Teaching Points: Modern dependency management with uv and lock files prevents collaboration breakdowns. Check for missing pyproject.toml, unpinned versions, and missing uv.lock files. Docker + uv isn't complexity - it's consistency across team members.
def analyze_survey():
    # I/O mixed with computation
    df = pd.read_csv("data/survey.csv")
    # Analysis logic
    avg_score = df['satisfaction'].mean()
    # More I/O
    with open("results.json", "w") as f:
        json.dump({"avg": avg_score}, f)
    print(f"Average: {avg_score}")

def calculate_satisfaction_stats(
    df: pd.DataFrame
) -> Dict[str, float]:
    """Pure function - easy to test."""
    return {
        'avg_score': df['satisfaction'].mean(),
        'std_score': df['satisfaction'].std()
    }

def main():
    # I/O layer
    df = pd.read_csv(DATA_PATH)
    # Pure computation
    stats = calculate_satisfaction_stats(df)
    # I/O layer
    save_results(stats, OUTPUT_PATH)

Key Teaching Points: I/O separation makes code testable and reusable. Look for functions that both read files AND do computation - that's a refactoring opportunity. Pure functions are easier to test, debug, and understand.
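The payoff of the pure function is that it can be tested without any files on disk. A minimal pytest-style sketch (in a real test module, calculate_satisfaction_stats would be imported from the team's package):

import pandas as pd
import pytest

def test_calculate_satisfaction_stats():
    # No CSV, no filesystem - just build the input in memory
    df = pd.DataFrame({'satisfaction': [3.0, 4.0, 5.0]})
    stats = calculate_satisfaction_stats(df)
    assert stats['avg_score'] == 4.0
    assert stats['std_score'] == pytest.approx(1.0)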
def load_data(path):
    df = pd.read_csv(path)
    # FileNotFoundError: [Errno 2] No such file
    assert 'score' in df.columns
    # AssertionError (no context!)
    print("Data loaded")  # Can't control output
    return df

import logging

logger = logging.getLogger(__name__)

def load_survey_data(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(
            f"Survey file not found: {path}\n"
            f"Current directory: {Path.cwd()}\n"
            f"Expected location: {path.absolute()}"
        )
    df = pd.read_csv(path)
    required_cols = ['user_id', 'satisfaction']
    missing = set(required_cols) - set(df.columns)
    if missing:
        raise ValueError(
            f"Missing columns in {path.name}: {missing}\n"
            f"Available: {list(df.columns)}"
        )
    logger.info(f"Loaded {len(df)} survey responses")
    return df

Key Teaching Points: Good error messages are love letters to future debuggers. Replace generic exceptions with specific, actionable messages. Logging beats print statements - it can be controlled and filtered appropriately.
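"Controlled and filtered" is easy to demonstrate: the calling script decides what appears and where, without touching the library code. A short sketch, assuming module-level loggers like the one above:

import logging

# The entry-point script decides what gets shown - library code just calls logger.info()
logging.basicConfig(
    level=logging.INFO,  # change to logging.WARNING to hide routine progress messages
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)

# Per-module filtering, e.g. quiet a chatty dependency without editing its code
logging.getLogger("urllib3").setLevel(logging.WARNING)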