5000-projects-analysis
Analyze 5000+ IFC and Revit projects at scale for patterns, benchmarks, and insights. Big data analysis for construction.
git clone --depth 1 https://github.com/datadrivenconstruction/DDC_Skills_for_AI_Agents_in_Construction /tmp/5000-projects-analysis && cp -r /tmp/5000-projects-analysis/1_DDC_Toolkit/Kaggle-Notebooks/5000-projects-analysis ~/.claude/skills/5000-projects-analysis
SKILL.md
# Large-Scale BIM Project Analysis
## Business Case
### Problem Statement
Construction companies lack industry benchmarks because:
- Individual project data is insufficient for statistical analysis
- Comparable project data is not available
- Manual analysis doesn't scale to thousands of projects
### Solution
Analyze 5000+ IFC and Revit projects to extract patterns, create benchmarks, and train ML models for prediction.
### Business Value
- **Industry benchmarks** - Compare your project to 5000+ others
- **Pattern detection** - Identify common designs and issues
- **ML training data** - Build predictive models with real data
- **Research foundation** - Academic and industry research dataset
## Technical Implementation
### Dataset Overview
| Metric | Value |
|--------|-------|
| Total Projects | 5000+ |
| File Formats | IFC, RVT |
| Elements | Millions |
| Categories | 200+ |
### Analysis Pipeline
```python
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List
import matplotlib.pyplot as plt
import seaborn as sns
class BIMProjectAnalyzer:
    """Aggregate element tables from many BIM projects and derive benchmarks.

    Every ``*.xlsx`` workbook found directly in ``data_path`` is treated as
    one project; its ``Elements`` sheet is read and tagged with a
    ``ProjectId`` taken from the file name. The analysis methods read the
    columns ``ElementId``, ``Category``, ``Volume`` and ``Area`` from the
    combined table, so ``load_projects()`` (or an equivalent assignment to
    ``elements``) must happen before they are called.
    """

    # Per-project metrics compared against the dataset in benchmark_project().
    _BENCHMARK_COLUMNS = ('ElementCount', 'TotalVolume', 'TotalArea')

    def __init__(self, data_path: str):
        """Store the data location; no files are read here.

        Args:
            data_path: Directory searched (non-recursively) for ``*.xlsx``
                project workbooks.
        """
        self.data_path = Path(data_path)
        # One DataFrame per successfully loaded workbook.
        self.projects: List[pd.DataFrame] = []
        # All loaded projects concatenated; None until load_projects() runs.
        self.elements = None

    def load_projects(self) -> int:
        """Load the ``Elements`` sheet of every workbook in ``data_path``.

        Workbooks that fail to load are reported on stdout and skipped, so a
        few broken files do not abort a large batch. Calling this method
        again reloads from scratch instead of appending duplicates.

        Returns:
            Number of workbooks loaded successfully. When it is 0,
            ``elements`` is an empty DataFrame.
        """
        loaded = []
        # Sorted so the row order of `elements` does not depend on the
        # file system's directory listing order.
        for workbook in sorted(self.data_path.glob("*.xlsx")):
            try:
                df = pd.read_excel(workbook, sheet_name="Elements")
            except Exception as e:
                # Deliberate best-effort: report and move on to the next file.
                print(f"Error loading {workbook}: {e}")
                continue
            df['ProjectId'] = workbook.stem
            loaded.append(df)

        self.projects = loaded
        # pd.concat raises ValueError on an empty list, so guard that case.
        if loaded:
            self.elements = pd.concat(loaded, ignore_index=True)
        else:
            self.elements = pd.DataFrame()
        return len(loaded)

    def project_statistics(self) -> pd.DataFrame:
        """Calculate statistics per project.

        Returns:
            One row per ``ProjectId`` with the columns ``ElementCount``
            (non-null ``ElementId`` values), ``CategoryCount`` (distinct
            categories), ``TotalVolume``, ``AvgVolume``, ``TotalArea`` and
            ``AvgArea``.
        """
        # Named aggregation ties each output column to its source column and
        # function, instead of renaming a MultiIndex by position.
        return self.elements.groupby('ProjectId').agg(
            ElementCount=('ElementId', 'count'),
            CategoryCount=('Category', 'nunique'),
            TotalVolume=('Volume', 'sum'),
            AvgVolume=('Volume', 'mean'),
            TotalArea=('Area', 'sum'),
            AvgArea=('Area', 'mean'),
        ).reset_index()

    def category_distribution(self) -> pd.DataFrame:
        """Analyze element distribution across categories.

        Returns:
            One row per ``Category`` with ``ElementCount``, ``ProjectCount``
            (distinct projects containing the category), ``TotalVolume``,
            ``TotalArea`` and ``AvgPerProject`` (elements per project that
            contains the category), sorted by ``ElementCount`` descending.
        """
        dist = self.elements.groupby('Category').agg(
            ElementCount=('ElementId', 'count'),
            ProjectCount=('ProjectId', 'nunique'),
            TotalVolume=('Volume', 'sum'),
            TotalArea=('Area', 'sum'),
        ).reset_index()
        dist['AvgPerProject'] = dist['ElementCount'] / dist['ProjectCount']
        return dist.sort_values('ElementCount', ascending=False)

    def find_outliers(self, column: str, threshold: float = 3.0) -> pd.DataFrame:
        """Find projects whose value in ``column`` is a z-score outlier.

        Args:
            column: Name of a numeric column of ``project_statistics()``.
            threshold: Minimum absolute z-score for a project to be returned.

        Returns:
            The rows of ``project_statistics()`` with ``|z| > threshold``.
            Empty when the spread is undefined or zero (for example a single
            project, or all projects sharing the same value).
        """
        stats = self.project_statistics()
        values = stats[column]
        mean = values.mean()
        std = values.std()
        # A NaN or zero deviation cannot yield a meaningful z-score; return
        # no outliers explicitly rather than relying on NaN comparisons.
        if pd.isna(std) or std == 0:
            return stats.iloc[0:0]
        z_scores = np.abs((values - mean) / std)
        return stats[z_scores > threshold]

    def benchmark_project(self, project_id: str) -> Dict:
        """Compare one project against the whole loaded dataset.

        Args:
            project_id: ``ProjectId`` of the project to benchmark.

        Returns:
            A dict with ``project_id``, ``percentiles`` (for each benchmark
            metric, the percentage of projects with a strictly smaller
            value, rounded to one decimal) and ``above_average`` (for each
            metric, whether the project exceeds the dataset mean). Values
            are plain ``float`` / ``bool`` so the result can be serialized.

        Raises:
            IndexError: If ``project_id`` is not in the dataset. The type is
                the one the positional lookup has always raised, kept for
                callers that already catch it.
        """
        stats = self.project_statistics()
        matches = stats[stats['ProjectId'] == project_id]
        if matches.empty:
            raise IndexError(
                f"Project {project_id!r} not found in the loaded dataset"
            )
        project = matches.iloc[0]

        percentiles = {}
        above_average = {}
        for col in self._BENCHMARK_COLUMNS:
            share_below = (stats[col] < project[col]).mean() * 100
            percentiles[col] = round(float(share_below), 1)
            # bool() converts numpy.bool_, which json.dumps cannot encode.
            above_average[col] = bool(project[col] > stats[col].mean())

        return {
            'project_id': project_id,
            'percentiles': percentiles,
            'above_average': above_average,
        }

    def generate_report(self, output_path: str) -> str:
        """Render a 2x2 overview figure of the dataset and save it.

        The panels are: element count histogram, the 15 categories with the
        most elements, total volume histogram, and category count versus
        element count per project.

        Args:
            output_path: File path the figure is written to.

        Returns:
            ``output_path``, unchanged.
        """
        stats = self.project_statistics()
        cat_dist = self.category_distribution()

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        try:
            # Element count distribution
            axes[0, 0].hist(stats['ElementCount'], bins=50, edgecolor='black')
            axes[0, 0].set_title('Element Count Distribution')
            axes[0, 0].set_xlabel('Elements per Project')

            # Top categories
            top_cats = cat_dist.head(15)
            axes[0, 1].barh(top_cats['Category'], top_cats['ElementCount'])
            axes[0, 1].set_title('Top 15 Categories')

            # Volume distribution
            axes[1, 0].hist(stats['TotalVolume'], bins=50, edgecolor='black')
            axes[1, 0].set_title('Total Volume Distribution')

            # Category count vs Element count
            axes[1, 1].scatter(
                stats['CategoryCount'], stats['ElementCount'], alpha=0.5
            )
            axes[1, 1].set_xlabel('Category Count')
            axes[1, 1].set_ylabel('Element Count')
            axes[1, 1].set_title('Complexity Analysis')

            fig.tight_layout()
            fig.savefig(output_path, dpi=150)
        finally:
            # pyplot keeps every figure alive until it is closed; release it
            # so repeated report generation does not accumulate memory.
            plt.close(fig)
        return output_path
```
### Analysis Examples
```python
# Point the analyzer at the folder that holds the project workbooks
analyzer = BIMProjectAnalyzer("C:/Data/5000_Projects")

# Read every workbook and report how many were usable
num_projects = analyzer.load_projects()
print(f"Loaded {num_projects} projects")

# Per-project aggregates feed the dataset-wide summary
stats = analyzer.project_statistics()
total_elements = len(analyzer.elements)
mean_elements = stats['ElementCount'].mean()
mean_volume = stats['TotalVolume'].mean()
print("\nDataset Summary:")
print(f" Total elements: {total_elements:,}")
print(f" Avg elements/project: {mean_elements:,.0f}")
print(f" Avg volume/project: {mean_volume:,.2f} m³")

# Which categories contribute the most elements
categories = analyzer.category_distribution()
summary_columns = ['Category', 'ElementCount', 'AvgPerProject']
print("\nTop 10 Categories:")
print(categories[summary_columns].head(10))

# Rank one project against the rest of the dataset
benchmark = analyzer.benchmark_project("MyProject_001")
print("\nProject Benchmark:")
print(f" Element count: {benchmark['percentiles']['ElementCount']}")
```
Generate automated daily progress reports from site data. Track work completed, labor hours, equipment usage, and weather conditions.
Analyze labor productivity from site data. Compare planned vs actual, identify trends, benchmark against industry standards.
Create interactive KPI dashboards for construction projects. Track schedule, cost, quality, and safety metrics in real-time.
Detect and analyze geometric clashes in BIM models. Identify MEP, structural, and architectural conflicts before construction.
Classify BIM elements using AI and standard classification systems. Map elements to UniFormat, MasterFormat, OmniClass, and CWICR codes.
Generate comprehensive BIM model validation reports. Check data quality, completeness, and compliance with standards.
Calculate CO2 emissions and carbon footprint from BIM model data. Analyze embodied carbon by material, element, and building system.
Extract quantities from IFC/Revit models for quantity takeoff. Uses DDC converters to get element counts, areas, volumes, lengths with grouping and reporting.