Skip to main content
ClaudeWave
Skill173 repo starsupdated 3mo ago

cwicr-data-loader

Load and parse DDC CWICR construction cost database from multiple formats: Parquet, Excel, CSV, Qdrant snapshots. Foundation for all CWICR operations.

Install in Claude Code
Copy
git clone --depth 1 https://github.com/datadrivenconstruction/DDC_Skills_for_AI_Agents_in_Construction /tmp/cwicr-data-loader && cp -r /tmp/cwicr-data-loader/1_DDC_Toolkit/CWICR-Database/cwicr-data-loader ~/.claude/skills/cwicr-data-loader
Then start a new Claude Code session; the skill loads automatically.

SKILL.md

# CWICR Data Loader

## Business Case

### Problem Statement
DDC CWICR database is distributed in multiple formats:
- Apache Parquet (optimized for analytics)
- Excel workbooks (human-readable)
- CSV files (universal exchange)
- Qdrant snapshots (vector search)

Applications need unified data access regardless of source format.

### Solution
Universal data loader supporting all CWICR formats with automatic schema detection, validation, and pandas DataFrame conversion.

### Business Value
- **Format agnostic** - Load from any CWICR distribution
- **Validated data** - Automatic schema validation
- **Memory efficient** - Lazy loading for large datasets
- **Type-safe** - Proper data types preserved

## Technical Implementation

### Prerequisites
```bash
pip install pandas pyarrow openpyxl qdrant-client
```

### Python Implementation

```python
import pandas as pd
import pyarrow.parquet as pq
from pathlib import Path
from typing import Optional, Dict, Any, List, Union
from dataclasses import dataclass, field
from enum import Enum
import json


class CWICRFormat(Enum):
    """Supported CWICR data formats."""
    PARQUET = "parquet"
    EXCEL = "excel"
    CSV = "csv"
    QDRANT = "qdrant"
    JSON = "json"


class CWICRLanguage(Enum):
    """Supported languages in CWICR database."""
    ARABIC = "ar"
    CHINESE = "zh"
    GERMAN = "de"
    ENGLISH = "en"
    SPANISH = "es"
    FRENCH = "fr"
    HINDI = "hi"
    PORTUGUESE = "pt"
    RUSSIAN = "ru"


@dataclass
class CWICRSchema:
    """CWICR database schema definition."""

    # Core fields
    work_item_code: str = "work_item_code"
    description: str = "description"
    unit: str = "unit"
    category: str = "category"

    # Cost fields
    unit_price: str = "unit_price"
    labor_cost: str = "labor_cost"
    material_cost: str = "material_cost"
    equipment_cost: str = "equipment_cost"
    overhead_cost: str = "overhead_cost"

    # Norm fields
    labor_norm: str = "labor_norm"
    material_norm: str = "material_norm"
    equipment_norm: str = "equipment_norm"

    # Metadata
    language: str = "language"
    region: str = "region"
    currency: str = "currency"
    last_updated: str = "last_updated"

    # Optional embedding
    embedding: str = "embedding"


@dataclass
class CWICRWorkItem:
    """Represents a single work item from CWICR database."""
    work_item_code: str
    description: str
    unit: str
    category: str

    unit_price: float = 0.0
    labor_cost: float = 0.0
    material_cost: float = 0.0
    equipment_cost: float = 0.0
    overhead_cost: float = 0.0

    labor_norm: float = 0.0
    labor_unit: str = "h"

    resources: List[Dict[str, Any]] = field(default_factory=list)

    language: str = "en"
    region: str = ""
    currency: str = "USD"


@dataclass
class CWICRResource:
    """Represents a resource (material, labor, equipment)."""
    resource_code: str
    description: str
    unit: str
    unit_price: float
    resource_type: str  # 'labor', 'material', 'equipment'
    category: str = ""


class CWICRDataLoader:
    """Universal loader for CWICR database formats."""

    REQUIRED_COLUMNS = ['work_item_code', 'description', 'unit']
    NUMERIC_COLUMNS = ['unit_price', 'labor_cost', 'material_cost',
                       'equipment_cost', 'labor_norm']

    def __init__(self):
        self.schema = CWICRSchema()
        self._cache: Dict[str, pd.DataFrame] = {}

    def load(self, source: str,
             format: Optional[CWICRFormat] = None,
             language: Optional[CWICRLanguage] = None,
             use_cache: bool = True) -> pd.DataFrame:
        """Load CWICR data from any supported source."""

        cache_key = f"{source}_{language}"
        if use_cache and cache_key in self._cache:
            return self._cache[cache_key]

        # Auto-detect format if not specified
        if format is None:
            format = self._detect_format(source)

        # Load based on format
        if format == CWICRFormat.PARQUET:
            df = self._load_parquet(source)
        elif format == CWICRFormat.EXCEL:
            df = self._load_excel(source)
        elif format == CWICRFormat.CSV:
            df = self._load_csv(source)
        elif format == CWICRFormat.JSON:
            df = self._load_json(source)
        else:
            raise ValueError(f"Unsupported format: {format}")

        # Validate and normalize
        df = self._validate_schema(df)
        df = self._normalize_types(df)

        # Filter by language if specified
        if language and 'language' in df.columns:
            df = df[df['language'] == language.value]

        # Cache result
        if use_cache:
            self._cache[cache_key] = df

        return df

    def _detect_format(self, source: str) -> CWICRFormat:
        """Auto-detect data format from source."""
        path = Path(source)

        if path.suffix.lower() == '.parquet':
            return CWICRFormat.PARQUET
        elif path.suffix.lower() in ['.xlsx', '.xls']:
            return CWICRFormat.EXCEL
        elif path.suffix.lower() == '.csv':
            return CWICRFormat.CSV
        elif path.suffix.lower() == '.json':
            return CWICRFormat.JSON
        else:
            raise ValueError(f"Cannot detect format: {source}")

    def _load_parquet(self, source: str) -> pd.DataFrame:
        """Load from Parquet file."""
        return pd.read_parquet(source)

    def _load_excel(self, source: str,
                    sheet_name: str = "WorkItems") -> pd.DataFrame:
        """Load from Excel workbook."""
        try:
            return pd.read_excel(source, sheet_name=sheet_name)
        except:
            # Try first sheet if named sheet doesn't exist
            return pd.read_excel(source, sheet_name=0)

    def _load_csv(self, source: str) -> pd.DataFrame:
        """Load from CSV file."""
        # Try different encodings
        for encoding in ['utf-8', 'latin-1', 'cp1252']:
            try:
                return pd.read_csv(sou