cwicr-data-validator
Validate CWICR data quality and estimate inputs. Check for errors, inconsistencies, outliers, and missing data.
git clone --depth 1 https://github.com/datadrivenconstruction/DDC_Skills_for_AI_Agents_in_Construction /tmp/cwicr-data-validator && cp -r /tmp/cwicr-data-validator/1_DDC_Toolkit/CWICR-Database/cwicr-data-validator ~/.claude/skills/cwicr-data-validator
SKILL.md
# CWICR Data Validator
## Business Case
### Problem Statement
Data quality issues cause:
- Incorrect estimates
- Budget overruns
- Delayed projects
- Rework costs
### Solution
Systematic validation of CWICR data and estimate inputs to catch errors, outliers, and inconsistencies before they impact projects.
### Business Value
- **Error prevention** - Catch issues early
- **Data quality** - Ensure reliable estimates
- **Consistency** - Standard validation rules
- **Audit trail** - Document data issues
## Technical Implementation
```python
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass, field
from enum import Enum
from datetime import datetime
class ValidationSeverity(Enum):
    """Severity level attached to a validation issue."""

    ERROR = "error"      # Must fix
    WARNING = "warning"  # Should review
    INFO = "info"        # For awareness
class ValidationCategory(Enum):
    """Kind of problem a validation issue describes."""

    MISSING_DATA = "missing_data"
    INVALID_VALUE = "invalid_value"
    OUTLIER = "outlier"
    DUPLICATE = "duplicate"
    INCONSISTENT = "inconsistent"
    FORMAT = "format"
@dataclass
class ValidationIssue:
    """Single validation issue raised for one field of one record."""

    # Name of the field (column) the issue refers to.
    field: str
    # Identifier of the offending record (the stringified DataFrame index).
    record_id: str
    # What kind of problem was detected.
    category: ValidationCategory
    # How serious the problem is.
    severity: ValidationSeverity
    # Human-readable description of the problem.
    message: str
    # The value found in the record, kept as-is for reporting.
    current_value: Any
    # Short description of what a valid value would look like.
    expected: str
@dataclass
class ValidationResult:
    """Complete outcome of validating a set of records."""

    # Number of records that were validated.
    total_records: int
    # Number of records whose row-level checks produced no ERROR issue.
    valid_records: int
    # Every issue found, across all records and severities.
    issues: List[ValidationIssue]
    # Count of issues with ERROR severity.
    error_count: int
    # Count of issues with WARNING severity.
    warning_count: int
    # Count of issues with INFO severity.
    info_count: int
    # When the validation was run.
    validation_date: datetime
    # True when no ERROR-severity issue was found.
    passed: bool
class CWICRDataValidator:
"""Validate CWICR data and estimates."""
# Standard validation rules
REQUIRED_FIELDS = ['work_item_code', 'description', 'unit']
NUMERIC_FIELDS = ['labor_cost', 'material_cost', 'equipment_cost', 'labor_norm']
POSITIVE_FIELDS = ['labor_cost', 'material_cost', 'equipment_cost', 'quantity']
# Outlier detection thresholds (IQR multiplier)
OUTLIER_THRESHOLD = 3.0
def __init__(self, cwicr_reference: Optional[pd.DataFrame] = None):
    """Store the optional reference data and precompute its statistics.

    Args:
        cwicr_reference: Reference CWICR data used to derive per-column
            statistics. When omitted, no statistics are built.
    """
    self.reference = cwicr_reference
    # Always define _stats so that code reading it never hits an
    # AttributeError when no reference data was supplied.
    self._stats = {}
    if cwicr_reference is not None:
        self._build_reference_stats()
def _build_reference_stats(self):
    """Build reference statistics for outlier detection.

    For every column named in ``NUMERIC_FIELDS`` that exists in
    ``self.reference``, the values are coerced to numbers (entries that
    cannot be parsed are dropped) and the mean, standard deviation,
    quartiles and inter-quartile range are stored in ``self._stats``
    under the column name. Columns with no usable values are left out.
    """
    self._stats = {}
    for col in self.NUMERIC_FIELDS:
        if col not in self.reference.columns:
            continue
        values = pd.to_numeric(self.reference[col], errors='coerce').dropna()
        if len(values) == 0:
            continue
        # Compute each quartile once and reuse it for the IQR.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        self._stats[col] = {
            'mean': values.mean(),
            'std': values.std(),
            'q1': q1,
            'q3': q3,
            'iqr': q3 - q1,
        }
def validate_dataframe(self, df: pd.DataFrame) -> ValidationResult:
    """Validate an entire dataframe and summarize the findings.

    Each row is checked with ``_validate_row``; a row counts as valid when
    none of its row-level issues has ERROR severity. If the frame has a
    ``work_item_code`` column, every row sharing a code with another row
    is additionally reported as a WARNING-level duplicate.

    Args:
        df: Records to validate, one record per row.

    Returns:
        ValidationResult with all issues, per-severity counts, and
        ``passed`` set to True when no ERROR-severity issue was found.
    """
    issues: List[ValidationIssue] = []
    valid_count = 0

    for idx, row in df.iterrows():
        row_issues = self._validate_row(row, str(idx))
        issues.extend(row_issues)
        if not any(i.severity == ValidationSeverity.ERROR for i in row_issues):
            valid_count += 1

    # Check for duplicates
    if 'work_item_code' in df.columns:
        # duplicated() treats missing values as equal to each other, so
        # rows with no code at all would be flagged as duplicates of one
        # another. Those rows are a missing-data problem, not a duplicate.
        dup_mask = (
            df.duplicated(subset=['work_item_code'], keep=False)
            & df['work_item_code'].notna()
        )
        for idx, row in df[dup_mask].iterrows():
            issues.append(ValidationIssue(
                field='work_item_code',
                record_id=str(idx),
                category=ValidationCategory.DUPLICATE,
                severity=ValidationSeverity.WARNING,
                message=f"Duplicate work item code: {row['work_item_code']}",
                current_value=row['work_item_code'],
                expected="Unique codes"
            ))

    error_count = sum(1 for i in issues if i.severity == ValidationSeverity.ERROR)
    warning_count = sum(1 for i in issues if i.severity == ValidationSeverity.WARNING)
    info_count = sum(1 for i in issues if i.severity == ValidationSeverity.INFO)

    return ValidationResult(
        total_records=len(df),
        valid_records=valid_count,
        issues=issues,
        error_count=error_count,
        warning_count=warning_count,
        info_count=info_count,
        validation_date=datetime.now(),
        passed=error_count == 0
    )
def _validate_row(self, row: pd.Series, record_id: str) -> List[ValidationIssue]:
"""Validate single row."""
issues = []
# Check required fields
for field in self.REQUIRED_FIELDS:
if field in row.index:
value = row[field]
if pd.isna(value) or str(value).strip() == '':
issues.append(ValidationIssue(
field=field,
record_id=record_id,
category=ValidationCategory.MISSING_DATA,
severity=ValidationSeverity.ERROR,
message=f"Required field '{field}' is missing",
current_value=value,
expected="Non-empty value"
))
# Check numeric fields
for field in self.NUMERIC_FIELDS:
if field in row.index:
value = row[field]
if pd.notna(value):
try:
num_val = float(value)
# Check for negative where positive expected
if field in self.POSITIVE_FIELDS and num_val < 0:
issues.append(ValidationIssue(
field=field,
record_id=record_id,
Generate automated daily progress reports from site data. Track work completed, labor hours, equipment usage, and weather conditions.
Analyze labor productivity from site data. Compare planned vs actual, identify trends, benchmark against industry standards.
Create interactive KPI dashboards for construction projects. Track schedule, cost, quality, and safety metrics in real-time.
Detect and analyze geometric clashes in BIM models. Identify MEP, structural, and architectural conflicts before construction.
Classify BIM elements using AI and standard classification systems. Map elements to UniFormat, MasterFormat, OmniClass, and CWICR codes.
Generate comprehensive BIM model validation reports. Check data quality, completeness, and compliance with standards.
Calculate CO2 emissions and carbon footprint from BIM model data. Analyze embodied carbon by material, element, and building system.
Extract quantities from IFC/Revit models for quantity takeoff. Uses DDC converters to get element counts, areas, volumes, lengths with grouping and reporting.