-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_exploration.py
More file actions
133 lines (103 loc) · 4.79 KB
/
data_exploration.py
File metadata and controls
133 lines (103 loc) · 4.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
"""
Data Exploration Script for CCTV Face Detection Dataset
Analyzes the dataset structure and provides statistics
"""
import os
import pandas as pd
import glob
def explore_dataset(
    dataset_root: str = "identity-employees-in-surveillance-cctv/dataset",
) -> None:
    """Print a structural and statistical overview of the dataset.

    The report covers five sections: training labels, test images,
    per-employee reference directories, a consistency check between training
    labels and reference directories, and a summary.

    Missing pieces of the dataset (no ``labels.csv``, no reference directory,
    no ``emp*`` sub-directories) are reported instead of raising.

    Args:
        dataset_root: Directory containing the ``train``, ``test`` and
            ``reference_faces`` sub-directories. Defaults to the location
            the script has always used, relative to the working directory.

    Returns:
        None. All results are written to standard output.
    """
    # Paths are built with "/" on purpose: several of them are echoed in the
    # report, and the printed text should not vary with the platform.
    train_dir = f"{dataset_root}/train"
    test_dir = f"{dataset_root}/test"
    reference_dir = f"{dataset_root}/reference_faces"

    print("=== CCTV Face Detection Dataset Exploration ===\n")

    # 1. Training data analysis
    print("1. TRAINING DATA ANALYSIS:")
    print("-" * 40)
    labels_file = f"{train_dir}/labels.csv"
    # ``df`` stays None when there is no labels file so that later sections
    # can test for it explicitly.
    df = None
    if os.path.exists(labels_file):
        df = pd.read_csv(labels_file)
        print(f"Training labels file: {labels_file}")
        print(f"Total training samples: {len(df)}")
        print(f"Columns: {list(df.columns)}")

        # Employee ID distribution
        emp_counts = df['emp_id'].value_counts()
        print(f"\nUnique employee IDs: {len(emp_counts)}")
        print("Employee ID distribution (top 10):")
        print(emp_counts.head(10))

        # Check for UNKNOWN entries
        unknown_count = (df['emp_id'] == 'UNKNOWN').sum()
        print(f"UNKNOWN faces in training: {unknown_count}")

        # Check if all image files listed in the labels exist on disk
        train_images_dir = f"{train_dir}/images"
        missing_files = sum(
            1
            for filename in df['filename']
            if not os.path.exists(f"{train_images_dir}/{filename}")
        )
        print(f"Missing training image files: {missing_files}")
    else:
        print(f"Training labels file not found: {labels_file}")
    print()

    # 2. Test data analysis
    print("2. TEST DATA ANALYSIS:")
    print("-" * 40)
    test_images = glob.glob(f"{test_dir}/images/*.jpg")
    print(f"Total test images: {len(test_images)}")
    print(f"Test images directory: {test_dir}/images")
    print()

    # 3. Reference faces analysis
    print("3. REFERENCE FACES ANALYSIS:")
    print("-" * 40)
    if os.path.isdir(reference_dir):
        # Only sub-directories whose name starts with "emp" count.
        reference_dirs = sorted(
            entry
            for entry in os.listdir(reference_dir)
            if entry.startswith('emp')
            and os.path.isdir(f"{reference_dir}/{entry}")
        )
    else:
        print(f"Reference directory not found: {reference_dir}")
        reference_dirs = []
    print(f"Total employee reference directories: {len(reference_dirs)}")
    print(f"Employee IDs: {reference_dirs[:10]}{'...' if len(reference_dirs) > 10 else ''}")

    # Count images/videos per employee and note who has videos, in one pass
    emp_image_counts = {}
    employees_with_videos = []
    for emp_id in reference_dirs:
        emp_dir = f"{reference_dir}/{emp_id}"
        photos = glob.glob(f"{emp_dir}/*.jpg")
        videos = glob.glob(f"{emp_dir}/*.mp4")
        emp_image_counts[emp_id] = len(photos) + len(videos)
        if videos:
            employees_with_videos.append(emp_id)
    total_reference_images = sum(emp_image_counts.values())

    print(f"Total reference images/videos: {total_reference_images}")
    if emp_image_counts:
        per_employee = list(emp_image_counts.values())
        average = total_reference_images / len(per_employee)
        print(
            "Images per employee (min/max/avg): "
            f"{min(per_employee)}/{max(per_employee)}/{average:.1f}"
        )
    else:
        # min()/max() and the average are undefined without any employees.
        print("Images per employee (min/max/avg): N/A")
    print(f"Employees with video files: {len(employees_with_videos)}")
    print()

    # 4. Data consistency checks
    print("4. DATA CONSISTENCY CHECKS:")
    print("-" * 40)
    # Check if all training employee IDs have reference faces
    if df is not None:
        training_emp_ids = set(df[df['emp_id'] != 'UNKNOWN']['emp_id'].unique())
        reference_emp_ids = set(reference_dirs)
        # Sorted (by string form, to tolerate non-string IDs such as NaN) so
        # the report is deterministic across runs.
        missing_in_reference = sorted(training_emp_ids - reference_emp_ids, key=str)
        extra_in_reference = sorted(reference_emp_ids - training_emp_ids, key=str)
        print(f"Training employee IDs: {len(training_emp_ids)}")
        print(f"Reference employee IDs: {len(reference_emp_ids)}")
        print(f"Missing in reference faces: {len(missing_in_reference)} - {missing_in_reference}")
        print(f"Extra in reference faces: {len(extra_in_reference)} - {extra_in_reference}")
    print()

    # 5. Summary
    print("5. DATASET SUMMARY:")
    print("-" * 40)
    print(f"Training samples: {len(df) if df is not None else 'N/A'}")
    print(f"Test samples: {len(test_images)}")
    print(f"Reference employees: {len(reference_dirs)}")
    print(f"Reference images/videos: {total_reference_images}")
    print()
    print("Dataset structure appears to be:")
    print("- Training: cropped face images with employee ID labels")
    print("- Test: cropped face images (no labels)")
    print("- Reference: high-quality gallery images per employee")
    print("- Task: Face identification with open-set recognition (UNKNOWN class)")
# Script entry point: produce the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    explore_dataset()