Create train_notebook_2.ipynb
Browse files- train_notebook_2.ipynb +1454 -0
train_notebook_2.ipynb
ADDED
|
@@ -0,0 +1,1454 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Pentachoron Constellation — Multi-Channel, HF Push, and Dataset Sweep
|
| 4 |
+
Apache-2.0
|
| 5 |
+
Author: AbstractPhil
|
| 6 |
+
Quartermaster: Mirel (GPT-5 Thinking)
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import os, sys, json, math, time, random, shutil, zipfile, platform
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
from typing import List, Tuple, Dict, Optional
|
| 15 |
+
|
| 16 |
+
import numpy as np
|
| 17 |
+
import torch
|
| 18 |
+
import torch.nn as nn
|
| 19 |
+
import torch.nn.functional as F
|
| 20 |
+
from torchvision import datasets, transforms
|
| 21 |
+
from torch.utils.data import DataLoader
|
| 22 |
+
from tqdm import tqdm
|
| 23 |
+
from sklearn.metrics import confusion_matrix
|
| 24 |
+
|
| 25 |
+
## ---------------------------------------------------------------------
|
| 26 |
+
## Fast settings / safety
|
| 27 |
+
## ---------------------------------------------------------------------
|
| 28 |
+
#torch.autograd.set_detect_anomaly(False)
|
| 29 |
+
#if torch.cuda.is_available():
|
| 30 |
+
# torch.backends.cudnn.benchmark = True
|
| 31 |
+
# torch.cuda.empty_cache()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ---------------------------------------------------------------------
|
| 35 |
+
# Configuration (edit these)
|
| 36 |
+
# ---------------------------------------------------------------------
|
| 37 |
+
# Single source of truth for every tunable in this script.  All defaults,
# including the Hugging Face path templates and the determinism switches,
# are declared in this one literal.
config: Dict = {
    # --- model dimensions ---
    "input_dim": 28 * 28,          # placeholder; the loader sets the real value
    "input_channels": "auto",      # "auto" | 1 | 3 ; enforced by the loader
    "base_dim": 56,
    "proj_dim": None,

    # --- constellation ---
    "num_classes": 10,             # placeholder; the loader sets the real value
    "num_pentachoron_pairs": 2,
    "lambda_separation": 0.391,

    # --- attention / extractor ---
    "num_heads": 2,
    "channels": 24,

    # --- optimisation ---
    "batch_size": 1024,
    "epochs": 20,
    "lr": 1e-2,
    "weight_decay": 1e-5,
    "temp": 0.7,

    # --- loss weights ---
    "w_ce": 1.0,
    "w_dual": 1.0,
    "w_rose": 1.0,
    "w_diag": 0.1,
    "w_reg": 0.1,                  # default geometric regulariser weight

    # --- legacy compatibility ---
    "loss_weight_scalar": 0.1,

    # --- dataset overrides ---
    "img_size": 28,                # unified target side length
    "img_channels": "auto",        # "auto" | 1 | 3 ; coerces every dataset
    "normalize": True,
    "per_dataset_norm": True,
    "augment": False,              # light, safe augmentation when True

    # --- sweep control ---
    "sweep_all": False,            # True runs every dataset
    "seed": 420,

    # --- Hugging Face target ---
    "hf_repo_id": "AbstractPhil/pentachora-multi-channel-frequency-encoded",
    "dataset": "QMNIST",

    # --- Hugging Face pathing / naming ---
    "hf_subdir_root": "",
    "hf_dataset_dir_template": "{dataset}",      # folder under the root
    "hf_run_dir_template": "{ts}_{dataset}",     # or "{ts}"
    "hf_weight_suffix_dataset": True,            # encoder_{dataset}.safetensors etc.
    "hf_preserve_case": True,                    # keep DatasetName casing in paths

    # --- reproducibility / determinism ---
    "deterministic": True,           # cudnn deterministic + benchmark disabled
    "strict_determinism": False,     # torch.use_deterministic_algorithms(True)
    "deterministic_cublas": False,   # export CUBLAS_WORKSPACE_CONFIG
    "seed_per_dataset": False,       # re-seed from the dataset name during a sweep
}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ---------------------------------------------------------------------
|
| 101 |
+
# Fast settings / safety
|
| 102 |
+
# ---------------------------------------------------------------------
|
| 103 |
+
# Autograd anomaly detection stays off.
torch.autograd.set_detect_anomaly(False)

# cuDNN behaviour follows config["deterministic"]: the deterministic path pins
# kernels and disables autotuning, the other path only enables autotuning.
if not bool(config.get("deterministic", True)):
    torch.backends.cudnn.benchmark = True
else:
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# TF32 is disabled for both matmul and cuDNN.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

# Opt-in cuBLAS workspace setting; an existing environment value wins.
if bool(config.get("deterministic_cublas", False)):
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Dump the effective configuration, one key per line.
print("\n" + "=" * 60)
print("PENTACHORON CONSTELLATION CONFIGURATION")
print("=" * 60)
for k, v in config.items():
    print(f"{k:24}: {v}")
|
| 130 |
+
|
| 131 |
+
# ---------------------------------------------------------------------
# Reproducibility
# ---------------------------------------------------------------------
|
| 137 |
+
def seed_everything(seed: int = 42,
                    deterministic: Optional[bool] = None,
                    strict: Optional[bool] = None):
    """Seed Python, NumPy and Torch (CPU + CUDA) and apply determinism flags.

    Args:
        seed: Value fed to every RNG and written to ``PYTHONHASHSEED``.
        deterministic: When True, cuDNN is put in deterministic mode with
            autotuning off; when False, autotuning is enabled.  ``None``
            falls back to ``config["deterministic"]`` (default True).
        strict: Passed to ``torch.use_deterministic_algorithms``.  ``None``
            falls back to ``config["strict_determinism"]`` (default False).
    """
    if deterministic is None:
        deterministic = bool(config.get("deterministic", True))
    if strict is None:
        strict = bool(config.get("strict_determinism", False))

    # OS / interpreter.
    # NOTE(review): the running interpreter presumably fixed its hash seed at
    # startup, so this mainly affects child processes -- confirm if relied on.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Honour the `deterministic` argument (it used to be accepted but ignored).
    torch.backends.cudnn.deterministic = bool(deterministic)
    torch.backends.cudnn.benchmark = not deterministic

    # Best-effort: the call may be unavailable on some torch builds.  Report
    # the failure instead of hiding it, but never abort seeding because of it.
    try:
        torch.use_deterministic_algorithms(bool(strict))
    except Exception as exc:
        print(f"[seed] torch.use_deterministic_algorithms({bool(strict)}) failed: {exc}")

    # RNGs
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
|
| 161 |
+
|
| 162 |
+
def make_torch_generator(seed: int) -> torch.Generator:
    """Return a fresh ``torch.Generator`` whose state is seeded with *seed*."""
    # Generator.manual_seed returns the generator itself, so it chains.
    return torch.Generator().manual_seed(seed)
|
| 166 |
+
|
| 167 |
+
def seed_worker(worker_id: int):
    """DataLoader ``worker_init_fn``: seed NumPy and ``random`` from Torch.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32; *worker_id*
    itself is not used.
    """
    derived = torch.initial_seed() % (2 ** 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived)
|
| 172 |
+
|
| 173 |
+
# Initial global seed
|
| 174 |
+
seed_everything(int(config.get("seed", 42)))
|
| 175 |
+
|
| 176 |
+
# ---------------------------------------------------------------------
|
| 177 |
+
# Setup & deps
|
| 178 |
+
# ---------------------------------------------------------------------
|
| 179 |
+
def _ensure(pkg, pip_name=None):
|
| 180 |
+
pip_name = pip_name or pkg
|
| 181 |
+
try:
|
| 182 |
+
__import__(pkg)
|
| 183 |
+
except Exception:
|
| 184 |
+
print(f"[setup] Installing {pip_name} ...")
|
| 185 |
+
os.system(f"{sys.executable} -m pip install -q {pip_name}")
|
| 186 |
+
|
| 187 |
+
_ensure("safetensors")
|
| 188 |
+
_ensure("huggingface_hub")
|
| 189 |
+
_ensure("pandas")
|
| 190 |
+
_ensure("psutil")
|
| 191 |
+
_ensure("medmnist")
|
| 192 |
+
|
| 193 |
+
from safetensors.torch import save_file as save_safetensors
|
| 194 |
+
from huggingface_hub import HfApi, create_repo, whoami, login
|
| 195 |
+
import pandas as pd
|
| 196 |
+
import psutil
|
| 197 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 198 |
+
|
| 199 |
+
# ---------------------------------------------------------------------
|
| 200 |
+
# Small utils
|
| 201 |
+
# ---------------------------------------------------------------------
|
| 202 |
+
def _timestamp() -> str:
|
| 203 |
+
return datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 204 |
+
|
| 205 |
+
def _param_count(m: nn.Module) -> int:
|
| 206 |
+
return sum(p.numel() for p in m.parameters())
|
| 207 |
+
|
| 208 |
+
def _resolve_repo_id(cfg: Dict) -> str:
|
| 209 |
+
rid = os.getenv("PENTACHORA_HF_REPO") or cfg.get("hf_repo_id")
|
| 210 |
+
if not rid:
|
| 211 |
+
raise RuntimeError("Set config['hf_repo_id'] or export PENTACHORA_HF_REPO.")
|
| 212 |
+
return rid
|
| 213 |
+
|
| 214 |
+
def _hf_login_if_needed():
    """Make sure a Hugging Face session exists before pushing.

    If ``whoami()`` succeeds nothing else happens.  Otherwise the function
    logs in with the ``HF_TOKEN`` environment variable when it is set, and
    prints a warning when it is not.
    """
    try:
        whoami()
        return
    except Exception:
        # Any whoami failure is treated as "not logged in".
        pass

    token = os.getenv("HF_TOKEN")
    if not token:
        print("[huggingface] No login found and HF_TOKEN not set. Push may fail; run `huggingface-cli login`.")
        return
    login(token=token, add_to_git_credential=True)
|
| 223 |
+
|
| 224 |
+
def _ensure_repo(repo_id: str) -> HfApi:
    """Create the public model repo *repo_id* if missing and return an ``HfApi``.

    ``exist_ok=True`` makes the creation call a no-op for an existing repo.
    """
    create_repo(
        repo_id=repo_id,
        private=False,
        exist_ok=True,
        repo_type="model",
    )
    return HfApi()
|
| 228 |
+
|
| 229 |
+
def _zip_dir(src: Path, dst_zip: Path):
|
| 230 |
+
with zipfile.ZipFile(dst_zip, "w", zipfile.ZIP_DEFLATED) as z:
|
| 231 |
+
for p in src.rglob("*"):
|
| 232 |
+
z.write(p, arcname=p.relative_to(src))
|
| 233 |
+
|
| 234 |
+
def _dataset_slug(name_or_names) -> str:
|
| 235 |
+
if isinstance(name_or_names, (list, tuple)):
|
| 236 |
+
return "+".join(n.strip().lower() for n in name_or_names)
|
| 237 |
+
return str(name_or_names).strip().lower()
|
| 238 |
+
|
| 239 |
+
# ---------------------------------------------------------------------
|
| 240 |
+
# Dataset loader (TorchVision + MedMNIST), config-aware
|
| 241 |
+
# ---------------------------------------------------------------------
|
| 242 |
+
try:
|
| 243 |
+
import medmnist
|
| 244 |
+
from medmnist import INFO as MED_INFO
|
| 245 |
+
except Exception:
|
| 246 |
+
medmnist = None
|
| 247 |
+
MED_INFO = None
|
| 248 |
+
|
| 249 |
+
_TORCHVISION_KEYS = {
|
| 250 |
+
"mnist": "MNIST",
|
| 251 |
+
"fashionmnist": "FashionMNIST",
|
| 252 |
+
"kmnist": "KMNIST",
|
| 253 |
+
"emnist": "EMNIST", # balanced
|
| 254 |
+
"qmnist": "QMNIST",
|
| 255 |
+
"usps": "USPS",
|
| 256 |
+
}
|
| 257 |
+
_MEDMNIST_MAP = {
|
| 258 |
+
"bloodmnist": "bloodmnist", "pathmnist": "pathmnist", "octmnist": "octmnist",
|
| 259 |
+
"pneumoniamnist": "pneumoniamnist", "dermamnist": "dermamnist", "retinamnist": "retinamnist",
|
| 260 |
+
"breastmnist": "breastmnist", "organamnist": "organamnist", "organcmnist": "organcmnist",
|
| 261 |
+
"organsmnist": "organsmnist", "tissuemnist": "tissuemnist",
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
_DATASET_STATS_1CH = {
|
| 265 |
+
"MNIST": ([0.1307], [0.3081]),
|
| 266 |
+
"FashionMNIST": ([0.2860], [0.3530]),
|
| 267 |
+
"KMNIST": ([0.1918], [0.3483]),
|
| 268 |
+
"EMNIST": ([0.1307], [0.3081]),
|
| 269 |
+
"QMNIST": ([0.1307], [0.3081]),
|
| 270 |
+
"USPS": ([0.5000], [0.5000]),
|
| 271 |
+
}
|
| 272 |
+
_MEAN1, _STD1 = [0.5], [0.5]
|
| 273 |
+
_MEAN3, _STD3 = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
|
| 274 |
+
|
| 275 |
+
def _norm_stats(name: str, channels: int) -> Tuple[List[float], List[float]]:
|
| 276 |
+
if channels == 1:
|
| 277 |
+
return _DATASET_STATS_1CH.get(name, (_MEAN1, _STD1))
|
| 278 |
+
return _MEAN3, _STD3
|
| 279 |
+
|
| 280 |
+
def _to_channels(target_c: int):
    """Build a transform that coerces a channel-first tensor to *target_c* channels.

    The returned ``transforms.Lambda`` maps a tensor whose first dimension
    is the channel count as follows:

    - already *target_c* channels: returned unchanged;
    - 3 -> 1: weighted sum of the three channels (0.2989, 0.5870, 0.1140);
    - more channels than requested: the leading *target_c* channels are kept;
    - 1 -> N: the single channel is repeated N times;
    - any other case with fewer channels than requested: ``ValueError``,
      because slicing cannot create the missing channels and would hand a
      tensor with the wrong channel count to later stages.
    """
    def _fn(t: torch.Tensor) -> torch.Tensor:
        c = t.shape[0]
        if c == target_c:
            return t
        if c > target_c:
            if target_c == 1 and c == 3:
                r, g, b = t[0], t[1], t[2]
                gray = 0.2989*r + 0.5870*g + 0.1140*b
                return gray.unsqueeze(0)
            return t[:target_c]
        # From here on the input has fewer channels than requested.
        if c == 1:
            return t.repeat(target_c, 1, 1)
        raise ValueError(
            f"Cannot expand a {c}-channel tensor to {target_c} channels."
        )
    return transforms.Lambda(_fn)
|
| 297 |
+
|
| 298 |
+
def _augmentations_for(name: str, size: int, channels: int) -> List[transforms.Transform]:
    """Return the train-time augmentation transforms for dataset *name*.

    An empty list is returned unless ``config["augment"]`` is truthy.
    Digit-style sets (MNIST, KMNIST, EMNIST, QMNIST, USPS) get a mild affine
    followed by an optional padded random crop.  Every other dataset gets the
    optional crop first, then a slightly stronger affine, plus colour jitter
    when *channels* is 3 and the name ends in "mnist".  The crop is only
    added when *size* is at least 32.
    """
    if not bool(config.get("augment", False)):
        return []

    crop: list = []
    if size >= 32:
        crop = [transforms.RandomCrop(size, padding=max(1, int(0.03 * size)))]

    if name.upper() in {"MNIST","KMNIST","EMNIST","QMNIST","USPS"}:
        gentle = transforms.RandomAffine(degrees=8, translate=(0.05, 0.05), scale=(0.95, 1.05))
        return [gentle] + crop

    pipeline = crop + [
        transforms.RandomAffine(degrees=10, translate=(0.05, 0.05), scale=(0.95, 1.05)),
    ]
    if channels == 3 and name.lower().endswith("mnist"):
        pipeline.append(
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.05, hue=0.02)
        )
    return pipeline
|
| 315 |
+
|
| 316 |
+
def _build_transforms(dataset_name: str, split: str, native_c: int, target_c: int|str, size: int, normalize: bool, per_dataset_norm: bool) -> transforms.Compose:
    """Compose the preprocessing pipeline for a TorchVision dataset.

    Args:
        dataset_name: Canonical dataset name, used for augmentation choice
            and per-dataset normalisation statistics.
        split: ``"train"`` enables augmentation; any other value disables it.
        native_c: Channel count the dataset yields on its own.
        target_c: ``"auto"`` keeps *native_c*; otherwise the channel count
            to coerce to.
        size: Output side length; images become ``size x size``.
        normalize: Whether to append ``transforms.Normalize``.
        per_dataset_norm: Use dataset-specific statistics instead of the
            generic 0.5 / 0.5 pair.

    Returns:
        A ``transforms.Compose`` that ends by flattening the tensor to 1-D.

    Stage order: Resize -> (train) augmentations -> ToTensor -> channel
    coercion -> Normalize -> flatten.  Resize is applied unconditionally
    because not every dataset is natively 28x28 (USPS images are 16x16), and
    it runs before the augmentations so ``RandomCrop(size)`` always sees an
    image that is already ``size x size``.
    """
    out_c = native_c if target_c == "auto" else int(target_c)

    # Image-space stages (run before ToTensor).
    steps: list = [transforms.Resize((size, size))]
    if split == "train":
        steps += _augmentations_for(dataset_name, size, out_c)

    # Tensor-space stages.
    steps.append(transforms.ToTensor())
    if target_c != "auto":
        steps.append(_to_channels(out_c))
    if normalize:
        if per_dataset_norm:
            mean, std = _norm_stats(dataset_name, out_c)
        else:
            mean, std = (_MEAN1, _STD1) if out_c == 1 else (_MEAN3, _STD3)
        steps.append(transforms.Normalize(mean=mean, std=std))
    steps.append(transforms.Lambda(lambda x: x.view(-1)))
    return transforms.Compose(steps)
|
| 335 |
+
|
| 336 |
+
def _label_to_int(y) -> int:
    """Convert one label (int, NumPy value/array, tensor, sequence) to ``int``.

    Any label holding exactly one element is read out directly, whatever its
    rank; labels with several elements are reduced with ``argmax``.
    """
    if isinstance(y, (int, np.integer)):
        return int(y)
    if torch.is_tensor(y):
        # numel()==1 covers shapes (), (1,), (1, 1), ... -- argmax on those
        # would always yield index 0 instead of the stored class id.
        return int(y.item()) if y.numel() == 1 else int(y.argmax().item())
    arr = np.asarray(y)
    return int(arr.item()) if arr.size == 1 else int(arr.argmax())


def collate_as_int(batch):
    """Collate ``(tensor, label)`` samples into ``(stacked_inputs, int64_labels)``.

    Args:
        batch: Non-empty sequence of ``(x, y)`` pairs where every ``x`` is a
            tensor of the same shape.

    Returns:
        ``xs`` stacked along a new leading dimension and a 1-D
        ``torch.long`` tensor with one integer label per sample.

    Raises:
        ValueError: if *batch* is empty.
    """
    if not batch:
        raise ValueError("collate_as_int received an empty batch")
    xs, ys = zip(*batch)
    xs = torch.stack(xs, dim=0)
    ys_tensor = torch.tensor([_label_to_int(y) for y in ys], dtype=torch.long)
    return xs, ys_tensor
|
| 355 |
+
|
| 356 |
+
def _get_med_info(flag: str) -> dict:
    """Return the MedMNIST ``INFO`` entry for *flag*.

    Raises:
        ImportError: if the ``medmnist`` import at module load failed.
        KeyError: if *flag* is not a key of ``INFO``.
    """
    if MED_INFO is not None:
        if flag in MED_INFO:
            return MED_INFO[flag]
        raise KeyError(f"Unknown MedMNIST flag: {flag}")
    raise ImportError("medmnist is not installed. `pip install medmnist`")
|
| 362 |
+
|
| 363 |
+
def _med_class_names(info: dict) -> List[str]:
|
| 364 |
+
lab = info["label"]
|
| 365 |
+
return [lab[str(i)] for i in range(len(lab))]
|
| 366 |
+
|
| 367 |
+
def load_single_dataset(name: str, split: str,
                        cfg: Optional[Dict]=None,
                        resolved_target_channels: Optional[int|str]=None
                        ) -> Tuple[torch.utils.data.Dataset, int, List[str], int, int]:
    """Load one TorchVision or MedMNIST dataset split with its transforms.

    Args:
        name: Dataset name; matched case-insensitively after stripping.
        split: ``"train"`` selects the training split; any other value
            selects the test split.
        cfg: Settings dict read for ``img_size``, ``img_channels``,
            ``normalize`` and ``per_dataset_norm``.  A falsy value falls
            back to the module-level ``config``.
        resolved_target_channels: When not None, overrides
            ``cfg["img_channels"]``.

    Returns:
        ``(dataset, num_classes, class_names, input_dim, output_channels)``
        where ``input_dim`` is ``output_channels * img_size * img_size``.

    Raises:
        ImportError: a MedMNIST dataset was requested without ``medmnist``.
        ValueError: *name* matches no known dataset.
    """
    cfg = cfg if cfg else config
    name_key = name.strip()
    name_lower = name_key.lower()
    is_train = (split == "train")

    size = int(cfg.get("img_size", 28))
    if resolved_target_channels is None:
        want_c = cfg.get("img_channels", "auto")
    else:
        want_c = resolved_target_channels
    normalize = bool(cfg.get("normalize", True))
    per_dataset_norm = bool(cfg.get("per_dataset_norm", True))

    # ---------------- TorchVision ----------------
    if name_lower in _TORCHVISION_KEYS:
        canonical = _TORCHVISION_KEYS[name_lower]
        native_c = 1
        transform = _build_transforms(canonical, split, native_c, want_c, size, normalize, per_dataset_norm)
        shared = dict(transform=transform, download=True)

        if canonical == "MNIST":
            ds = datasets.MNIST("./data", train=is_train, **shared)
            cls_names = [f"digit-{i}" for i in range(10)]
        elif canonical == "FashionMNIST":
            garments = ['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle boot']
            ds = datasets.FashionMNIST("./data", train=is_train, **shared)
            cls_names = [f"fashion-{n}" for n in garments]
        elif canonical == "KMNIST":
            ds = datasets.KMNIST("./data", train=is_train, **shared)
            cls_names = [f"kmnist-{c}" for c in "おきすつなはまやれを"]
        elif canonical == "EMNIST":
            ds = datasets.EMNIST("./data", split='balanced', train=is_train, **shared)
            # 10 digits + 26 upper-case + 11 lower-case symbols = 47 classes.
            symbols = "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abdefghnqrt"
            cls_names = [f"emnist-{c}" for c in symbols]
        elif canonical == "QMNIST":
            ds = datasets.QMNIST("./data", what=('train' if is_train else 'test'), **shared)
            cls_names = [f"qmnist-{i}" for i in range(10)]
        elif canonical == "USPS":
            ds = datasets.USPS("./data", train=is_train, **shared)
            cls_names = [f"usps-{i}" for i in range(10)]
        else:
            raise ValueError(f"Unhandled TorchVision dataset: {canonical}")

        # Every branch lists exactly one name per class.
        ncls = len(cls_names)
        out_c = native_c if want_c == "auto" else int(want_c)
        return ds, ncls, cls_names, out_c * size * size, out_c

    # ---------------- MedMNIST ----------------
    if name_lower in _MEDMNIST_MAP:
        if medmnist is None:
            raise ImportError("medmnist not available. `pip install medmnist`")
        info = _get_med_info(_MEDMNIST_MAP[name_lower])
        DataClass = getattr(medmnist, info["python_class"])
        native_c = int(info.get("n_channels", 1))
        out_c = native_c if want_c == "auto" else int(want_c)

        def _passthrough():
            # Placeholder stage used when an optional step is disabled.
            return transforms.Lambda(lambda t: t)

        channel_stage = _to_channels(out_c) if want_c != "auto" else _passthrough()
        aug_stages = _augmentations_for(name_key, size, out_c) if is_train else []
        resize_stage = transforms.Resize((size, size)) if size != 28 else _passthrough()
        if not normalize:
            norm_stage = _passthrough()
        else:
            if per_dataset_norm:
                mean, std = _norm_stats(name_key, out_c)
            elif out_c == 1:
                mean, std = _MEAN1, _STD1
            else:
                mean, std = _MEAN3, _STD3
            norm_stage = transforms.Normalize(mean, std)

        transform = transforms.Compose(
            [transforms.ToTensor(), channel_stage]
            + list(aug_stages)
            + [resize_stage, norm_stage, transforms.Lambda(lambda t: t.view(-1))]
        )
        ds = DataClass(split=('train' if is_train else 'test'), transform=transform, download=True, size=size)
        ncls = len(info["label"])
        cls_names = _med_class_names(info)
        return ds, ncls, cls_names, out_c * size * size, out_c

    raise ValueError(f"Unknown dataset name: {name}")
|
| 442 |
+
|
| 443 |
+
def get_dataset_single(name: str, batch_size: int, num_workers: int = 2):
    """Build train and test ``DataLoader`` objects for one dataset.

    Both splits are loaded with the module-level ``config``.  The two
    loaders share one generator seeded from ``config["seed"]`` (default
    42), the same collate function and the same worker seeding hook; only
    the training loader shuffles.

    Returns:
        ``(train_loader, test_loader, num_classes, class_names, input_dim,
        channels)``, with the metadata taken from the training split.
    """
    train_ds, n_train, class_names, dim_train, ch_train = load_single_dataset(name, "train", config)
    test_ds, n_test, _, dim_test, ch_test = load_single_dataset(name, "test", config)
    # Both splits must agree on class count, flattened size and channels.
    assert (n_train, dim_train, ch_train) == (n_test, dim_test, ch_test)

    loader_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
        collate_fn=collate_as_int,
        worker_init_fn=seed_worker,
        generator=make_torch_generator(int(config.get("seed", 42))),
        persistent_workers=False,
    )
    train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
    test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

    return train_loader, test_loader, n_train, class_names, dim_train, ch_train
|
| 464 |
+
|
| 465 |
+
# Dataset catalogs
|
| 466 |
+
# Display names of the TorchVision datasets this script knows how to load.
TORCHVISION_DATASETS = [
    "MNIST",
    "FashionMNIST",
    "KMNIST",
    "EMNIST",
    "QMNIST",
    "USPS",
]

# Display names of the supported MedMNIST datasets.
MEDMNIST_DATASETS = [
    "BloodMNIST",
    "PathMNIST",
    "OCTMNIST",
    "PneumoniaMNIST",
    "DermaMNIST",
    "RetinaMNIST",
    "BreastMNIST",
    "OrganAMNIST",
    "OrganCMNIST",
    "OrganSMNIST",
    "TissueMNIST",
]
|
| 469 |
+
|
| 470 |
+
# ---------------------------------------------------------------------
|
| 471 |
+
# Models
|
| 472 |
+
# ---------------------------------------------------------------------
|
| 473 |
+
class PentaFreqExtractor(nn.Module):
    """
    Multi-channel spectral extractor:
    - Input: [B, C*H*W] with H == W, unflattened to [B, C, H, W]
    - 5 convolutional branches ("frequency bands"), each ending in a 7x7
      adaptive average pool and a linear layer -> base_dim features each

    forward() returns ``(vertices, adjacency)`` where ``vertices`` is
    [B, 5, base_dim] and ``adjacency`` is the fixed [5, 5] buffer holding ones
    everywhere except a zero diagonal.

    Raises:
        ValueError: if ``input_dim`` cannot be written as ``input_ch * side * side``.
    """

    def __init__(self, input_dim: int = 784, input_ch: int = 1, base_dim: int = 64, channels: int = 12):
        super().__init__()
        self.input_dim = input_dim
        self.input_ch = int(input_ch)

        # Exact integer square root: no float rounding, and the check below is a
        # real exception rather than an `assert` that `python -O` would strip.
        side = math.isqrt(max(0, int(input_dim)) // max(1, self.input_ch))
        if side * side * self.input_ch != input_dim:
            side_f = (max(0, input_dim) / max(1, self.input_ch)) ** 0.5
            raise ValueError(
                f"input_dim ({input_dim}) != C*H*W with H=W; C={self.input_ch}, side≈{side_f:.3f}"
            )
        self.unflatten = nn.Unflatten(1, (self.input_ch, side, side))
        self.base_dim = base_dim

        # Every branch flattens a [channels, 7, 7] map before its linear encoder.
        pooled_features = channels * 49

        # Vertex 0 (ultra-high): full resolution, second conv is depthwise.
        self.v0_ultrahigh = nn.Sequential(
            nn.Conv2d(self.input_ch, channels, 3, padding=1),
            nn.BatchNorm2d(channels), nn.ReLU(),
            nn.Conv2d(channels, channels, 3, padding=1, groups=channels),
            nn.BatchNorm2d(channels), nn.ReLU(),
            nn.AdaptiveAvgPool2d(7), nn.Flatten()
        )
        self.v0_encode = nn.Linear(pooled_features, base_dim)

        # Vertex 1 (high): one 2x max-pool between the two convs.
        self.v1_high = nn.Sequential(
            nn.Conv2d(self.input_ch, channels, 3, padding=1),
            nn.BatchNorm2d(channels), nn.Tanh(),
            nn.MaxPool2d(2),
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.BatchNorm2d(channels), nn.Tanh(),
            nn.AdaptiveAvgPool2d(7), nn.Flatten()
        )
        self.v1_encode = nn.Linear(pooled_features, base_dim)

        # Vertex 2 (mid): strided 5x5 conv halves the resolution.
        self.v2_mid = nn.Sequential(
            nn.Conv2d(self.input_ch, channels, 5, padding=2, stride=2),
            nn.BatchNorm2d(channels), nn.GELU(),
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.BatchNorm2d(channels), nn.GELU(),
            nn.AdaptiveAvgPool2d(7), nn.Flatten()
        )
        self.v2_encode = nn.Linear(pooled_features, base_dim)

        # Vertex 3 (low-mid): 2x average pool, then a 7x7 conv.
        self.v3_lowmid = nn.Sequential(
            nn.AvgPool2d(2),
            nn.Conv2d(self.input_ch, channels, 7, padding=3),
            nn.BatchNorm2d(channels), nn.SiLU(),
            nn.AdaptiveAvgPool2d(7), nn.Flatten()
        )
        self.v3_encode = nn.Linear(pooled_features, base_dim)

        # Vertex 4 (low): 4x average pool, then a 7x7 conv.
        self.v4_low = nn.Sequential(
            nn.AvgPool2d(4),
            nn.Conv2d(self.input_ch, channels, 7, padding=3),
            nn.BatchNorm2d(channels), nn.Sigmoid(),
            nn.AdaptiveAvgPool2d(7), nn.Flatten()
        )
        self.v4_encode = nn.Linear(pooled_features, base_dim)

        self.register_buffer("adjacency_matrix", torch.ones(5, 5) - torch.eye(5))
        self._init_edge_kernels(channels)

    @torch.no_grad()
    def _init_edge_kernels(self, channels: int):
        """Overwrite five filters of the first ultra-high conv with fixed 3x3 stencils.

        Only input channel 0 of output channels 0..4 is touched; nothing happens
        when fewer than 5 output channels exist.
        """
        if channels < 5:
            return
        conv0 = self.v0_ultrahigh[0]
        if not isinstance(conv0, nn.Conv2d):
            return
        if conv0.weight.shape[1] < 1:
            return

        k = conv0.weight
        # (stencil rows, divisor) for output channels 0..4, in order.
        stencils = (
            ([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], 4),
            ([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], 4),
            ([[0, -1, 0], [-1, 4, -1], [0, -1, 0]], 4),
            ([[1, 0, 0], [0, -1, 0], [0, 0, 0]], 2),
            ([[-1, 0, 1], [-1, 0, 1], [-1, 0, 1]], 3),
        )
        for out_ch, (rows, divisor) in enumerate(stencils):
            k[out_ch, 0] = torch.tensor(rows, dtype=k.dtype) / divisor

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode a flattened batch into five vertex embeddings plus the adjacency buffer."""
        img = self.unflatten(x)
        v0 = self.v0_encode(self.v0_ultrahigh(img))
        v1 = self.v1_encode(self.v1_high(img))
        v2 = self.v2_encode(self.v2_mid(img))
        v3 = self.v3_encode(self.v3_lowmid(img))
        v4 = self.v4_encode(self.v4_low(img))
        vertices = torch.stack([v0, v1, v2, v3, v4], dim=1)  # [B,5,D]
        return vertices, self.adjacency_matrix
|
| 558 |
+
|
| 559 |
+
class PentachoronCrossAttention(nn.Module):
    """
    Multi-head attention among the vertices of one pentachoron, restricted by
    an adjacency matrix: vertex ``i`` may only attend to vertices ``j`` with
    ``adjacency[i, j] != 0``.

    ``dim`` must be divisible by ``num_heads`` (required by
    ``nn.MultiheadAttention``).
    """

    def __init__(self, dim: int, num_heads: int = 14, dropout: float = 0.0):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads=num_heads, dropout=dropout, batch_first=True)

    def _row_to_attn_mask(self, row: torch.Tensor) -> torch.Tensor:
        """Turn one adjacency row into a [1, V] additive mask (-inf where the row is 0)."""
        mask = torch.zeros(1, row.numel(), device=row.device, dtype=torch.float32)
        mask[(row == 0).unsqueeze(0)] = float("-inf")
        return mask

    def forward(self, vertices: torch.Tensor, adjacency: torch.Tensor) -> torch.Tensor:
        """
        Args:
            vertices: [B, V, D] vertex embeddings.
            adjacency: [V, V] matrix; zero entries are masked out.

        Returns:
            [B, V, D] attended vertices, row ``i`` being the output for query ``i``.
        """
        # One [V, V] additive mask covers every query row at once, so a single
        # attention call replaces V separate single-query calls. The mask takes
        # the input dtype so non-float32 inputs do not meet a float32 mask.
        adjacency = adjacency.to(vertices.device)
        mask = torch.zeros(adjacency.shape, device=vertices.device, dtype=vertices.dtype)
        mask = mask.masked_fill(adjacency == 0, float("-inf"))
        out, _ = self.attn(vertices, vertices, vertices, attn_mask=mask, need_weights=False)
        return out
|
| 577 |
+
|
| 578 |
+
class PentachoronOpinionFusion(nn.Module):
    """
    Blend two views of the five vertex embeddings and fuse them into a single
    unit-norm latent vector.

    The two views are a similarity-weighted copy of the vertices
    (``_softmax_geometry``) and the adjacency-masked cross-attention output.
    They are mixed with a learned scalar ``sigmoid(_lambda_raw)``, flattened,
    passed through an MLP and an optional bias-free projection, then
    L2-normalised.
    """

    def __init__(self, base_dim: int = 64, proj_dim: Optional[int] = None, num_heads: int = 14, p_dropout: float = 0.2):
        super().__init__()
        wide, mid = base_dim * 3, base_dim * 2
        self.cross = PentachoronCrossAttention(dim=base_dim, num_heads=num_heads)
        fusion_layers = [
            nn.Linear(base_dim * 5, wide),
            nn.BatchNorm1d(wide),
            nn.ReLU(),
            nn.Dropout(p_dropout),
            nn.Linear(wide, mid),
            nn.BatchNorm1d(mid),
            nn.ReLU(),
            nn.Linear(mid, base_dim),
        ]
        self.fusion = nn.Sequential(*fusion_layers)
        if proj_dim is None:
            self.projection = None
        else:
            self.projection = nn.Linear(base_dim, proj_dim, bias=False)
        # Raw mixing logit; 0.0 gives an initial 50/50 blend after the sigmoid.
        self._lambda_raw = nn.Parameter(torch.tensor(0.0))

    @staticmethod
    def _softmax_geometry(vertices: torch.Tensor, adjacency: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Weight each vertex by a softmax over its summed cosine similarity to adjacent vertices.

        Returns ``(weighted_vertices [B,5,D], weights [B,5])``.
        """
        unit = F.normalize(vertices, dim=2, eps=1e-8)
        cosine = torch.bmm(unit, unit.transpose(1, 2))
        masked = cosine * adjacency.to(vertices.dtype).unsqueeze(0)
        per_vertex = masked.sum(dim=2)
        weights = F.softmax(per_vertex, dim=1)  # [B,5]
        return vertices * weights.unsqueeze(2), weights

    def forward(self, vertices: torch.Tensor, adjacency: torch.Tensor, return_diag: bool = False):
        """Return ``(z, diag)``; ``diag`` is None unless ``return_diag`` is true."""
        geo_view, vertex_weights = self._softmax_geometry(vertices, adjacency)
        attn_view = self.cross(vertices, adjacency)

        lam = torch.sigmoid(self._lambda_raw)
        blended = lam * geo_view + (1.0 - lam) * attn_view

        fused = self.fusion(blended.flatten(1))
        if self.projection is not None:
            fused = self.projection(fused)
        z = F.normalize(fused, dim=1)

        diag = None
        if return_diag:
            diag = {"lambda": lam.detach(), "softmax_weights": vertex_weights.detach()}
        return z, diag
|
| 613 |
+
|
| 614 |
+
class PentaFreqEncoderV2(nn.Module):
    """
    Full encoder: a PentaFreqExtractor producing five vertex embeddings,
    followed by a PentachoronOpinionFusion that collapses them into one latent.
    """

    def __init__(self, input_dim: int = 784, input_ch: int = 1, base_dim: int = 64, proj_dim: Optional[int] = None, num_heads: int = 14, channels: int = 12):
        super().__init__()
        self.extractor = PentaFreqExtractor(
            input_dim=input_dim,
            input_ch=input_ch,
            base_dim=base_dim,
            channels=channels,
        )
        self.opinion = PentachoronOpinionFusion(
            base_dim=base_dim,
            proj_dim=proj_dim,
            num_heads=num_heads,
        )

    @torch.no_grad()
    def get_frequency_contributions(self, x: torch.Tensor) -> torch.Tensor:
        """Return only the per-vertex softmax weights computed by the fusion module."""
        vertices, adjacency = self.extractor(x)
        return self.opinion._softmax_geometry(vertices, adjacency)[1]

    def forward(self, x: torch.Tensor, return_diag: bool = False):
        """Return the latent ``z``, or ``(z, diag)`` when ``return_diag`` is true."""
        vertices, adjacency = self.extractor(x)
        z, diag = self.opinion(vertices, adjacency, return_diag)
        if return_diag:
            return z, diag
        return z
|
| 628 |
+
|
| 629 |
+
class BatchedPentachoronConstellation(nn.Module):
    """
    Classifier head built from ``num_pairs`` pairs of learnable 5-vertex
    point sets ("dispatchers" and "specialists"), each stored as [P, 5, dim].

    Classes are assigned to vertices round-robin (``class % 5``). A latent is
    scored against every vertex by distance, the resulting per-vertex
    probabilities gate per-vertex linear heads, and the per-pair class scores
    are combined by a learned weighted sum and an MLP.

    ``dim`` must be divisible by 14 because of the fixed-head
    ``cross_attention`` submodule.
    """

    def __init__(self, num_classes: int, dim: int, num_pairs: int = 5, device: Optional[torch.device] = None, lambda_sep: float = 0.5):
        super().__init__()
        self.num_classes = num_classes
        self.dim = dim
        self.num_pairs = num_pairs
        # Device used only while building the initial pentachora.
        self.device = device if device is not None else torch.device("cpu")
        self.lambda_separation = lambda_sep

        self.dispatchers = nn.Parameter(self._init_batched_pentachora())
        self.specialists = nn.Parameter(self._init_batched_pentachora())

        self.dispatcher_weights = nn.Parameter(torch.randn(num_pairs, 5) * 0.1)
        self.specialist_weights = nn.Parameter(torch.randn(num_pairs, 5) * 0.1)
        self.temps = nn.Parameter(0.3 * torch.ones(num_pairs))

        self.register_buffer("vertex_map", self._create_vertex_mapping())

        # One linear head per vertex, sized by how many classes map to it;
        # vertices with no classes get a None placeholder.
        group_sizes = [int((self.vertex_map == v).sum().item()) for v in range(5)]
        self.group_heads = nn.ModuleList([
            nn.Linear(dim, size) if size > 0 else None
            for size in group_sizes
        ])

        # Kept so existing checkpoints still load; forward() does not use it.
        self.cross_attention = nn.MultiheadAttention(embed_dim=dim, num_heads=14, dropout=0.1, batch_first=True)
        self.aggregation_weights = nn.Parameter(torch.ones(num_pairs) / num_pairs)

        self.fusion = nn.Sequential(
            nn.Linear(num_classes * num_pairs, num_classes * 2),
            nn.BatchNorm1d(num_classes * 2), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(num_classes * 2, num_classes)
        )

        self.coherence_head = nn.Sequential(nn.Linear(dim, dim // 2), nn.GELU(), nn.Linear(dim // 2, 1))

    def _init_batched_pentachora(self) -> torch.Tensor:
        """Build [P, 5, dim] starting points: a scaled 4-D simplex plus noise in the remaining dims."""
        sqrt15, sqrt10, sqrt5 = np.sqrt(15), np.sqrt(10), np.sqrt(5)
        base_simplex = torch.tensor([
            [ 1.0, 0.0, 0.0, 0.0],
            [-0.25, sqrt15/4, 0.0, 0.0],
            [-0.25, -sqrt15/12, sqrt10/3, 0.0],
            [-0.25, -sqrt15/12, -sqrt10/6, sqrt5/2],
            [-0.25, -sqrt15/12, -sqrt10/6, -sqrt5/2]
        ], device=self.device, dtype=torch.float32)
        base_simplex = F.normalize(base_simplex, dim=1)
        pentachora = torch.zeros(self.num_pairs, 5, self.dim, device=self.device, dtype=torch.float32)
        for i in range(self.num_pairs):
            # Each successive pair is 10% larger than the previous one.
            pentachora[i, :, :4] = base_simplex * (1 + 0.1 * i)
            if self.dim > 4:
                pentachora[i, :, 4:] = torch.randn(5, self.dim - 4, device=self.device) * (random.random() * 0.25)
        return pentachora * 2.0

    def _create_vertex_mapping(self) -> torch.Tensor:
        """Map class index c to vertex c % 5."""
        return torch.arange(self.num_classes, dtype=torch.long) % 5

    def forward(self, x: torch.Tensor):
        """
        Args:
            x: [B, dim] latents.

        Returns:
            (logits [B, num_classes], diagnostics dict with "disp_d", "spec_d"
            and "probs", each [B, P, 5]).
        """
        B = x.size(0)
        coherence_gate = torch.sigmoid(self.coherence_head(x))  # [B,1]

        x_exp = x.unsqueeze(1).unsqueeze(2)                      # [B,1,1,D]
        disp_d = torch.norm(x_exp - self.dispatchers.unsqueeze(0), dim=3)  # [B,P,5]
        spec_d = torch.norm(x_exp - self.specialists.unsqueeze(0), dim=3)  # [B,P,5]

        dw = F.softmax(self.dispatcher_weights, dim=1).unsqueeze(0)
        sw = F.softmax(self.specialist_weights, dim=1).unsqueeze(0)
        temps = torch.clamp(self.temps, 0.1, 2.0).view(1, -1, 1)

        # Closer vertices get larger logits; the coherence gate rescales them.
        disp_logits = -(disp_d * dw) / temps
        spec_logits = -(spec_d * sw) / temps

        c = coherence_gate.unsqueeze(-1)
        disp_probs = F.softmax(disp_logits * c, dim=2)
        spec_probs = F.softmax(spec_logits * c, dim=2)
        probs = 0.5 * disp_probs + 0.5 * spec_probs             # [B,P,5]

        # The group heads do not depend on the pair index, so evaluate each once
        # and scatter its outputs into the columns of the classes it owns.
        class_logits = torch.zeros(B, self.num_classes, device=x.device)
        for v_idx, head in enumerate(self.group_heads):
            if head is None:
                continue
            idxs = (self.vertex_map == v_idx).nonzero(as_tuple=True)[0]
            class_logits[:, idxs] = head(x)

        # Gate every class by the probability of its own vertex, per pair.
        scores_tensor = class_logits.unsqueeze(1) * probs[:, :, self.vertex_map]  # [B,P,C]

        agg = F.softmax(self.aggregation_weights, dim=0).view(1, -1, 1)
        weighted = (scores_tensor * agg).sum(dim=1)              # [B,C]
        fused = self.fusion(scores_tensor.flatten(1))            # [B,C]
        final = 0.6 * weighted + 0.4 * fused
        return final, {"disp_d": disp_d, "spec_d": spec_d, "probs": probs}

    def _batched_cayley_menger(self, pentachora: torch.Tensor) -> torch.Tensor:
        """
        Stable Cayley-Menger proxy: |det(M + eps*I)| from the eigenvalues of the
        bordered squared-distance matrix M.

        Returns one non-negative scalar per pentachoron; values near zero mean
        the five points are (nearly) degenerate.
        """
        P = pentachora.shape[0]
        d2 = torch.cdist(pentachora, pentachora).pow(2)          # [P,5,5]
        # Allocate next to the input, not on the device recorded at construction.
        M = torch.zeros(P, 6, 6, device=pentachora.device, dtype=pentachora.dtype)
        M[:, 0, 1:] = 1.0
        M[:, 1:, 0] = 1.0
        M[:, 1:, 1:] = d2

        eps = 1e-6
        I = torch.eye(6, device=pentachora.device, dtype=pentachora.dtype).unsqueeze(0)
        M_eps = M + eps * I
        # eigvalsh -> real, sorted
        evals = torch.linalg.eigvalsh(M_eps)                     # [P,6]
        # M has a zero diagonal, so it always has negative eigenvalues. Take
        # magnitudes BEFORE clamping; clamping signed values would replace every
        # negative eigenvalue by 1e-12 and collapse the determinant to ~0.
        logdet = evals.abs().clamp_min(1e-12).log().sum(dim=1)   # log|det|
        det = torch.exp(logdet)                                  # |det|
        # keep it finite
        det = torch.nan_to_num(det, nan=0.0, posinf=1e6, neginf=0.0)
        return det

    def _batched_edge_variance(self, pentachora: torch.Tensor) -> torch.Tensor:
        """Penalise unequal edge lengths, plus any pentachoron whose shortest edge is below 0.5."""
        d = torch.cdist(pentachora, pentachora)
        mask = torch.triu(torch.ones(5, 5, device=pentachora.device), diagonal=1).bool()
        edges = d[:, mask]                                       # [P,10]
        return edges.var(dim=1).sum() + torch.relu(0.5 - edges.min(dim=1)[0]).sum()

    def regularization_loss(self, vertex_weights=None) -> torch.Tensor:
        """
        Geometric regulariser over both point-set banks.

        Sums a hinge on the Cayley-Menger proxy, the edge-variance term, an
        alignment penalty and a minimum-separation hinge between paired
        centroids, divided by ``num_pairs``. When ``vertex_weights`` (length 5)
        is given, a weighted vertex-norm term scaled by 0.1 is added.
        """
        disp_cm = self._batched_cayley_menger(self.dispatchers)
        spec_cm = self._batched_cayley_menger(self.specialists)
        cm_loss = torch.relu(1.0 - torch.abs(disp_cm)).sum() + torch.relu(1.0 - torch.abs(spec_cm)).sum()
        edge_loss = self._batched_edge_variance(self.dispatchers) + self._batched_edge_variance(self.specialists)

        disp_centers = self.dispatchers.mean(dim=1)
        spec_centers = self.specialists.mean(dim=1)
        cos_sims = F.cosine_similarity(disp_centers, spec_centers, dim=1, eps=1e-8)
        ortho = torch.abs(cos_sims).sum() * self.lambda_separation
        separations = torch.norm(disp_centers - spec_centers, dim=1)
        sep = torch.relu(2.0 - separations).sum() * self.lambda_separation

        dyn = 0.0
        if vertex_weights is not None:
            vw = vertex_weights.to(self.dispatchers.device)
            disp_norms = torch.norm(self.dispatchers, p=2, dim=2)
            spec_norms = torch.norm(self.specialists, p=2, dim=2)
            dyn = 0.1 * ((disp_norms * vw.unsqueeze(0)).mean() + (spec_norms * vw.unsqueeze(0)).mean())

        return (cm_loss + edge_loss + ortho + sep) / self.num_pairs + dyn
|
| 785 |
+
|
| 786 |
+
# ---------------------------------------------------------------------
|
| 787 |
+
# Losses
|
| 788 |
+
# ---------------------------------------------------------------------
|
| 789 |
+
def dual_contrastive_loss(latents, targets, constellation, temp: float):
    """
    InfoNCE-style loss of each latent against the five vertices of every
    dispatcher and specialist point set.

    The positive for a sample is the vertex its class maps to
    (``constellation.vertex_map[targets]``); the other four vertices of the
    same set act as negatives. The two bank losses, each averaged over batch
    and pairs, are summed.
    """
    batch = latents.size(0)
    z = F.normalize(latents, dim=1, eps=1e-8)
    tvert = constellation.vertex_map[targets]

    def _bank_nce(bank):
        # Cosine similarity of every latent to every vertex, temperature-scaled.
        anchors = F.normalize(bank, dim=2, eps=1e-8)
        logits = torch.einsum('bd,pvd->bpv', z, anchors) / temp
        pick = tvert.view(batch, 1, 1).expand(batch, logits.size(1), 1)
        positive = logits.gather(2, pick).squeeze(2)
        return (torch.logsumexp(logits, dim=2) - positive).mean()

    return _bank_nce(constellation.dispatchers) + _bank_nce(constellation.specialists)
|
| 803 |
+
|
| 804 |
+
class RoseDiagnosticHead(nn.Module):
    """Small MLP (Linear -> GELU -> LayerNorm -> Linear) mapping a latent to one scalar."""

    def __init__(self, latent_dim: int, hidden_dim: int = 128):
        super().__init__()
        layers = [
            nn.Linear(latent_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the prediction with a trailing dimension of size 1."""
        prediction = self.net(x)
        return prediction
|
| 810 |
+
|
| 811 |
+
def rose_score_magnitude(x, need, relation, purpose, eps: float = 1e-8):
    """
    Mean cosine similarity of ``x`` to the three reference vectors, scaled by
    the norm of ``x`` (clamped below at ``eps``). All reductions run over the
    last dimension.
    """
    unit_x = F.normalize(x, dim=-1, eps=eps)
    cos_need, cos_relation, cos_purpose = (
        (unit_x * F.normalize(ref, dim=-1, eps=eps)).sum(-1)
        for ref in (need, relation, purpose)
    )
    alignment = (cos_need + cos_relation + cos_purpose) / 3.0
    magnitude = x.norm(dim=-1).clamp_min(eps)
    return alignment * magnitude
|
| 819 |
+
|
| 820 |
+
def rose_contrastive_loss(latents, targets, constellation, temp: float = 0.5):
    """
    Contrastive loss against the pair-averaged vertices, down-weighted per
    sample by its ROSE score.

    Returns ``(loss, rose)`` where ``rose`` is the detached per-sample score.
    The sample weight ``1 - tanh(rose)`` is detached as well, so gradients
    reach the latents and vertices only through the contrastive term.
    """
    batch, dim = latents.shape
    tvert = constellation.vertex_map[targets]

    # Reference vectors for the score: the target vertex averaged over pairs
    # (specialists -> need, dispatchers -> relation) and the global specialist mean.
    need = constellation.specialists[:, tvert, :].mean(dim=0)
    relation = constellation.dispatchers[:, tvert, :].mean(dim=0)
    purpose = constellation.specialists.mean(dim=(0, 1)).unsqueeze(0).expand(batch, dim)
    rose = rose_score_magnitude(latents, need, relation, purpose)
    sample_weight = (1.0 - torch.tanh(rose)).detach()

    spec_anchor = F.normalize(constellation.specialists.mean(dim=0), dim=1, eps=1e-8)
    disp_anchor = F.normalize(constellation.dispatchers.mean(dim=0), dim=1, eps=1e-8)
    z = F.normalize(latents, dim=1, eps=1e-8)
    target_col = tvert.view(-1, 1)

    spec_logits = (z @ spec_anchor.T) / temp
    disp_logits = (z @ disp_anchor.T) / temp
    spec_nll = torch.logsumexp(spec_logits, dim=1) - spec_logits.gather(1, target_col).squeeze(1)
    disp_nll = torch.logsumexp(disp_logits, dim=1) - disp_logits.gather(1, target_col).squeeze(1)

    per_sample = 0.5 * (spec_nll + disp_nll)
    return (per_sample * sample_weight).mean(), rose.detach()
|
| 839 |
+
|
| 840 |
+
# ---------------------------------------------------------------------
|
| 841 |
+
# Regularization helpers
|
| 842 |
+
# ---------------------------------------------------------------------
|
| 843 |
+
def get_class_similarity(constellation_model: BatchedPentachoronConstellation, num_classes: int) -> torch.Tensor:
    """
    Cosine similarity between the rows of the last fusion layer's weight
    matrix, clamped to [0, 1]. ``num_classes`` is accepted but not used.
    """
    final_layer = constellation_model.fusion[-1]
    rows = F.normalize(final_layer.weight.data.detach(), p=2, dim=1)
    similarity = rows @ rows.T
    return torch.clamp(similarity, 0.0, 1.0)
|
| 847 |
+
|
| 848 |
+
def vertex_weights_from_confusion(cm: np.ndarray, class_similarity: torch.Tensor, vertex_map: torch.Tensor, device: torch.device) -> torch.Tensor:
    """
    Turn a confusion matrix into five per-vertex weights.

    Per-class error rates are spread to similar classes through a Gaussian
    kernel on ``1 - class_similarity`` (sigma 0.5), averaged over the classes
    of each vertex, mapped through ``1 - tanh`` and L1-normalised to sum to 5.
    Vertices with more propagated error therefore get smaller weights.
    """
    row_totals = cm.sum(axis=1)
    hits = cm.diagonal()
    # Classes with no samples keep an accuracy of 0.
    per_class_acc = np.zeros_like(hits, dtype=float)
    np.divide(hits, row_totals, out=per_class_acc, where=row_totals != 0)
    error_rate = 1.0 - torch.tensor(per_class_acc, device=device, dtype=torch.float32)

    sigma = 0.5
    dissimilarity = 1 - class_similarity
    kernel = torch.exp(-(dissimilarity ** 2) / (2 * sigma ** 2))
    spread = kernel @ error_rate

    vertex_total = torch.zeros(5, device=device)
    vertex_count = torch.zeros(5, device=device)
    for cls_idx, vertex in enumerate(vertex_map.tolist()):
        vertex_total[vertex] += spread[cls_idx]
        vertex_count[vertex] += 1

    occupied = vertex_count > 0
    vertex_mean = torch.zeros_like(vertex_total)
    vertex_mean[occupied] = vertex_total[occupied] / vertex_count[occupied]

    raw_weights = 1.0 - torch.tanh(vertex_mean)
    return F.normalize(raw_weights, p=1, dim=0) * 5.0
|
| 863 |
+
|
| 864 |
+
# ---------------------------------------------------------------------
|
| 865 |
+
# Evaluate / Train
|
| 866 |
+
# ---------------------------------------------------------------------
|
| 867 |
+
def evaluate(encoder: nn.Module, constellation: nn.Module, loader, num_classes: int, device: torch.device, collect_diag: bool = False):
    """
    Run the encoder + constellation over ``loader`` in eval mode without gradients.

    Returns:
        acc: overall accuracy as a float.
        per_class: list of per-class accuracies (0 for classes with no samples).
        cm: confusion matrix over labels ``0..num_classes-1``.
        avg_soft_w: mean softmax vertex weights as a numpy array when
            ``collect_diag`` is true, else None.
        lam_eval: current ``sigmoid(encoder.opinion._lambda_raw)`` as a float,
            or None if the encoder has no such parameter.

    Both modules are left in eval mode on return.
    """
    encoder.eval(); constellation.eval()
    pred_chunks, target_chunks = [], []
    soft_w_sums = torch.zeros(5, device=device)
    soft_w_count = 0

    with torch.no_grad():
        for x, y in tqdm(loader, desc="Evaluating"):
            x, y = x.to(device), y.to(device)
            if collect_diag:
                z, diag = encoder(x, return_diag=True)
                w = diag["softmax_weights"]
                soft_w_sums += w.sum(dim=0)
                soft_w_count += w.size(0)
            else:
                z = encoder(x)
            logits, _ = constellation(z)
            pred_chunks.append(logits.argmax(dim=1).cpu().numpy())
            target_chunks.append(y.cpu().numpy())

    all_preds = np.concatenate(pred_chunks)
    all_targets = np.concatenate(target_chunks)
    acc = float((all_preds == all_targets).mean())

    cm = confusion_matrix(all_targets, all_preds, labels=np.arange(num_classes))
    row_totals = cm.sum(axis=1)
    per_class = np.divide(cm.diagonal(), row_totals, out=np.zeros(num_classes), where=row_totals != 0)

    avg_soft_w = None
    if collect_diag and soft_w_count > 0:
        avg_soft_w = (soft_w_sums / soft_w_count).detach().cpu().numpy()

    # The mixing parameter does not change during evaluation, so read it once
    # here instead of forcing a host sync with .item() on every batch.
    lam_eval = None
    if hasattr(encoder, "opinion") and hasattr(encoder.opinion, "_lambda_raw"):
        with torch.no_grad():
            lam_eval = float(torch.sigmoid(encoder.opinion._lambda_raw).item())

    return acc, per_class.tolist(), cm, avg_soft_w, lam_eval
|
| 895 |
+
|
| 896 |
+
def _adapt_pairs_by_classes(cfg: Dict, num_classes: int) -> int:
|
| 897 |
+
# Keep ~<=10 classes per vertex group across pairs
|
| 898 |
+
pairs = cfg.get("num_pentachoron_pairs", 1)
|
| 899 |
+
target = max(1, int(math.ceil(num_classes / 10)))
|
| 900 |
+
return max(pairs, target)
|
| 901 |
+
|
| 902 |
+
def train_one(
    train_loader,
    test_loader,
    num_classes: int,
    cfg: dict,
    device: torch.device,
    writer: SummaryWriter,
    class_names: Optional[list] = None,
):
    """
    Train an encoder, constellation head and diagnostic head on one dataset.

    Each step combines five weighted losses: cross-entropy, dual contrastive,
    ROSE contrastive, diagnostic MSE and the constellation regulariser. After
    every epoch the model is evaluated on ``test_loader``, scalars go to
    ``writer``, and the resulting confusion matrix is turned into the vertex
    weights used by the regulariser in the next epoch.

    ``cfg`` is copied, not mutated. Required keys read here: "input_dim",
    "base_dim", "lambda_separation", "lr", "weight_decay", "epochs", "temp"
    and "loss_weight_scalar".

    Returns:
        (encoder, constellation, diag_head, history, best) where ``history``
        holds per-epoch lists and ``best`` the best test accuracy, its
        confusion matrix and its 1-based epoch.
    """
    pairs = _adapt_pairs_by_classes(cfg, num_classes)
    if pairs != cfg.get("num_pentachoron_pairs"):
        print(f"[auto] Adjusting num_pentachoron_pairs -> {pairs} for {num_classes} classes.")
    # Work on a shallow copy so the caller's config keeps its original pair count.
    cfg_local = dict(cfg); cfg_local["num_pentachoron_pairs"] = pairs

    encoder = PentaFreqEncoderV2(
        input_dim=cfg_local["input_dim"],
        input_ch=cfg_local.get("input_channels", 1),
        base_dim=cfg_local["base_dim"],
        proj_dim=None,
        num_heads=cfg_local.get("num_heads", 14),
        channels=cfg_local.get("channels", 12),
    ).to(device)

    constellation = BatchedPentachoronConstellation(
        num_classes=num_classes,
        dim=cfg_local["base_dim"],
        num_pairs=cfg_local["num_pentachoron_pairs"],
        device=device,
        lambda_sep=cfg_local["lambda_separation"],
    ).to(device)

    diag_head = RoseDiagnosticHead(cfg_local["base_dim"]).to(device)

    # A single optimizer covers all three modules.
    params = list(encoder.parameters()) + list(constellation.parameters()) + list(diag_head.parameters())
    optim = torch.optim.AdamW(params, lr=cfg_local["lr"], weight_decay=cfg_local["weight_decay"])
    lr_sched = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=cfg_local["epochs"])

    # Loss weights; w_reg falls back to the required "loss_weight_scalar" entry.
    w_ce = float(cfg_local.get("w_ce", 1.0))
    w_dual = float(cfg_local.get("w_dual", 1.0))
    w_rose = float(cfg_local.get("w_rose", 1.0))
    w_diag = float(cfg_local.get("w_diag", 0.1))
    w_reg = float(cfg_local.get("w_reg", cfg_local["loss_weight_scalar"]))

    history = {"train_loss": [], "train_acc": [], "test_acc": [], "ce": [], "dual": [], "rose": [], "diag": [], "reg": [], "lambda": []}
    best = {"acc": 0.0, "cm": None, "epoch": -1}
    # None during the first epoch; replaced after every evaluation below.
    vertex_weights = None

    global_step = 0
    for epoch in range(cfg_local["epochs"]):
        # evaluate() leaves the modules in eval mode, so switch back every epoch.
        encoder.train(); constellation.train(); diag_head.train()
        sum_loss = sum_ce = sum_dual = sum_rose = sum_diag = sum_reg = 0.0
        correct = total = 0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg_local['epochs']} [Train]")
        for x, y in pbar:
            x, y = x.to(device), y.to(device)
            optim.zero_grad()

            z = encoder(x)
            logits, _ = constellation(z)

            l_ce = F.cross_entropy(logits, y)
            l_dual = dual_contrastive_loss(z, y, constellation, temp=cfg_local["temp"])
            l_rose, rose_scores = rose_contrastive_loss(z, y, constellation, temp=cfg_local["temp"])
            # The diagnostic head sees detached latents, so its MSE loss does
            # not send gradients into the encoder.
            pred_rose = diag_head(z.detach()).squeeze(-1)
            l_diag = F.mse_loss(pred_rose, rose_scores)
            l_reg = constellation.regularization_loss(vertex_weights=vertex_weights)

            loss = (w_ce*l_ce) + (w_dual*l_dual) + (w_rose*l_rose) + (w_diag*l_diag) + (w_reg*l_reg)

            # after computing l_ce, l_dual, l_rose, l_diag, l_reg and loss
            if not torch.isfinite(l_ce) or not torch.isfinite(l_dual) \
                or not torch.isfinite(l_rose) or not torch.isfinite(l_diag) \
                or not torch.isfinite(l_reg) or not torch.isfinite(loss):
                # NOTE: l_diag is checked above but not included in this message.
                print("[NaN-guard] non-finite detected. Skipping step. "
                      f"ce={l_ce.item() if torch.isfinite(l_ce) else 'nan'}, "
                      f"dual={l_dual.item() if torch.isfinite(l_dual) else 'nan'}, "
                      f"rose={l_rose.item() if torch.isfinite(l_rose) else 'nan'}, "
                      f"reg={l_reg.item() if torch.isfinite(l_reg) else 'nan'}")

                # Soft defuse: drop LR a bit for stability
                # (halved on every group, never below 1e-6)
                for g in optim.param_groups:
                    g["lr"] = max(g["lr"] * 0.5, 1e-6)

                optim.zero_grad(set_to_none=True)
                # Optional: clip parameter norms right now to kill accidental blow-ups
                # (only fully finite tensors are clamped; diag_head is not touched)
                with torch.no_grad():
                    for p in list(encoder.parameters()) + list(constellation.parameters()):
                        if torch.isfinite(p).all():
                            p.clamp_(-1e3, 1e3)
                # Skipped batches contribute nothing to the running sums or to TensorBoard.
                continue
            loss.backward()
            # Gradients are clipped separately for encoder and constellation;
            # diag_head gradients are left unclipped.
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1.0)
            torch.nn.utils.clip_grad_norm_(constellation.parameters(), 1.0)
            optim.step()

            # Sums are weighted by batch size so the epoch averages are per sample.
            bs = x.size(0)
            sum_loss += loss.item() * bs
            sum_ce += l_ce.item() * bs
            sum_dual += l_dual.item() * bs
            sum_rose += l_rose.item() * bs
            sum_diag += l_diag.item() * bs
            sum_reg += l_reg.item() * bs

            preds = logits.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += bs

            # TB (step)
            writer.add_scalar("step/loss", loss.item(), global_step)
            writer.add_scalar("step/ce", l_ce.item(), global_step)
            writer.add_scalar("step/dual", l_dual.item(), global_step)
            writer.add_scalar("step/rose", l_rose.item(), global_step)
            writer.add_scalar("step/diag", l_diag.item(), global_step)
            writer.add_scalar("step/reg", l_reg.item(), global_step)
            global_step += 1

            pbar.set_postfix({
                "loss": f"{loss.item():.4f}",
                "acc": f"{correct/max(1,total):.4f}",
                "ce": f"{l_ce.item():.3f}",
                "dual": f"{l_dual.item():.3f}",
                "rose": f"{l_rose.item():.3f}",
                "reg": f"{l_reg.item():.3f}",
            })

        # max(1, total) guards against an epoch in which every batch was skipped.
        train_loss = sum_loss / max(1, total)
        train_acc = correct / max(1, total)
        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["ce"].append(sum_ce / max(1,total))
        history["dual"].append(sum_dual / max(1,total))
        history["rose"].append(sum_rose / max(1,total))
        history["diag"].append(sum_diag / max(1,total))
        history["reg"].append(sum_reg / max(1,total))

        # Eval
        test_acc, per_class_acc, cm, avg_soft_w, lam_eval = evaluate(
            encoder, constellation, test_loader, num_classes, device, collect_diag=True
        )
        history["test_acc"].append(test_acc)
        if lam_eval is not None:
            history["lambda"].append(lam_eval)
        else:
            history["lambda"].append(float(torch.sigmoid(encoder.opinion._lambda_raw).item())
                                     if hasattr(encoder, "opinion") else 0.5)

        lr_sched.step()

        # TB (epoch)
        writer.add_scalar("epoch/train_loss", train_loss, epoch+1)
        writer.add_scalar("epoch/train_acc", train_acc, epoch+1)
        writer.add_scalar("epoch/test_acc", test_acc, epoch+1)
        writer.add_scalar("epoch/lr", optim.param_groups[0]["lr"], epoch+1)
        writer.add_scalar("epoch/lambda", history["lambda"][-1], epoch+1)

        print(f"\n[Epoch {epoch+1}/{cfg_local['epochs']}] "
              f"TrainLoss {train_loss:.4f} | TrainAcc {train_acc:.4f} | TestAcc {test_acc:.4f} | "
              f"CE {history['ce'][-1]:.3f} Dual {history['dual'][-1]:.3f} "
              f"ROSE {history['rose'][-1]:.3f} Reg {history['reg'][-1]:.3f} λ {history['lambda'][-1]:.3f}")

        # Update reg weights
        # NOTE(review): these come from the confusion matrix of test_loader, so
        # next epoch's regulariser depends on test-set results.
        with torch.no_grad():
            class_sim = get_class_similarity(constellation, num_classes).to(device)
            vertex_weights = vertex_weights_from_confusion(cm, class_sim, constellation.vertex_map, device)

        if test_acc > best["acc"]:
            best["acc"], best["cm"], best["epoch"] = test_acc, cm, epoch+1
            print(f" 🎯 New Best Acc: {best['acc']:.4f} at epoch {best['epoch']}")

        # Optional confusion per epoch
        # Best effort: any failure (missing plotting packages included) is
        # reported and training continues.
        try:
            import matplotlib.pyplot as plt
            import seaborn as sns
            os.makedirs("plots", exist_ok=True)
            plt.figure(figsize=(9, 7))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=class_names, yticklabels=class_names)
            plt.title(f'Confusion (Epoch {epoch+1}) | Acc: {test_acc:.4f}')
            plt.xlabel('Predicted'); plt.ylabel('True'); plt.tight_layout()
            plt.savefig(f'plots/confusion_epoch_{epoch+1}.png', dpi=150)
            plt.close()
        except Exception as e:
            print(f"(Confusion heatmap skipped this epoch: {e})")

    return encoder, constellation, diag_head, history, best
|
| 1088 |
+
|
| 1089 |
+
# ---------------------------------------------------------------------
|
| 1090 |
+
# Plots (local convenience; artifacts already keep TB)
|
| 1091 |
+
# ---------------------------------------------------------------------
|
| 1092 |
+
def plot_history(history: dict, outdir: str = "plots"):
    """Write the training-curve figures for ``history`` into ``outdir``.

    ``outdir`` is created when missing.  Three PNG files are produced:

    * ``accuracy.png`` -- ``history['train_acc']`` and ``history['test_acc']``
    * ``loss_components.png`` -- ``train_loss``, ``ce``, ``dual``, ``rose``
      and ``reg``
    * ``lambda.png`` -- ``history['lambda']`` (drawn without a legend)

    A missing history key raises ``KeyError``.
    """
    os.makedirs(outdir, exist_ok=True)
    import matplotlib.pyplot as plt

    def _render(figsize, curves, title, ylabel, filename, legend=True):
        # One figure per call: plot every (key, label) curve, decorate, save, close.
        plt.figure(figsize=figsize)
        for key, label in curves:
            if legend:
                plt.plot(history[key], label=label)
            else:
                plt.plot(history[key])
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        if legend:
            plt.legend()
        plt.grid(True, ls='--', alpha=0.4)
        plt.tight_layout()
        plt.savefig(f"{outdir}/{filename}", dpi=150)
        plt.close()

    _render(
        (10, 5),
        [('train_acc', 'Train Acc'), ('test_acc', 'Test Acc')],
        'Accuracy over Epochs', 'Accuracy', 'accuracy.png',
    )
    _render(
        (10, 5),
        [
            ('train_loss', 'Total'),
            ('ce', 'CE'),
            ('dual', 'DualNCE'),
            ('rose', 'ROSE'),
            ('reg', 'Reg'),
        ],
        'Loss Components', 'Loss', 'loss_components.png',
    )
    _render(
        (8, 4),
        [('lambda', None)],
        'λ (Geometry ↔ Attention Gate)', 'λ', 'lambda.png',
        legend=False,
    )
|
| 1114 |
+
|
| 1115 |
+
def plot_confusion(cm: np.ndarray, class_names: list, outpath: str):
    """Save the confusion matrix ``cm`` as an image at ``outpath``.

    A seaborn annotated heatmap (tick labels taken from ``class_names``) is
    drawn when seaborn imports and the heatmap call succeeds; on any failure
    of that path the figure is cleared and a plain ``imshow`` rendering with
    a colorbar is drawn instead.

    Exactly one figure is created and it is always closed, including when
    drawing or saving raises.  Errors from ``tight_layout`` / ``savefig``
    propagate to the caller rather than triggering a second save attempt.
    """
    import matplotlib.pyplot as plt

    def _label_axes():
        plt.title('Best Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('True')

    fig = plt.figure(figsize=(10, 8))
    try:
        try:
            import seaborn as sns
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=class_names, yticklabels=class_names)
            _label_axes()
        except Exception:
            # Deliberate best-effort fallback (seaborn missing or heatmap
            # rejected the input): drop any partial drawing and reuse the
            # same figure instead of leaking it and opening a second one.
            fig.clf()
            plt.imshow(cm, aspect='auto')
            _label_axes()
            plt.colorbar()
        plt.tight_layout()
        plt.savefig(outpath, dpi=150)
    finally:
        plt.close(fig)
|
| 1129 |
+
|
| 1130 |
+
|
| 1131 |
+
def _sanitize_for_path(s: str, preserve_case: bool = True) -> str:
|
| 1132 |
+
"""Keep letters/digits/._- ; replace others with '_'."""
|
| 1133 |
+
out = []
|
| 1134 |
+
for ch in (s if preserve_case else s.lower()):
|
| 1135 |
+
if ch.isalnum() or ch in "._-":
|
| 1136 |
+
out.append(ch)
|
| 1137 |
+
else:
|
| 1138 |
+
out.append("_")
|
| 1139 |
+
return "".join(out)
|
| 1140 |
+
|
| 1141 |
+
def _build_hf_paths(dataset_name: str, cfg: Dict, ts: str) -> Dict[str, str]:
    """Derive the in-repo and local output locations for one dataset run.

    Optional ``cfg`` keys (defaults in parentheses):

    * ``hf_preserve_case`` (``True``) -- keep the dataset name's case
    * ``hf_subdir_root`` (``"pentachora-adaptive-encoded"``) -- top folder;
      surrounding ``/`` are stripped
    * ``hf_dataset_dir_template`` (``"{dataset}"``)
    * ``hf_run_dir_template`` (``"{ts}_{dataset}"``)

    Both templates may use ``{dataset}``, ``{slug}`` and ``{ts}``.

    Returns a dict with ``dataset_token`` (sanitized dataset name),
    ``path_in_repo`` (``root/dataset_dir/run_dir``) and ``local_root`` (the
    same layout under ``artifacts/``, as a string).
    """
    keep_case = bool(cfg.get("hf_preserve_case", True))
    display_name = dataset_name.lower() if not keep_case else dataset_name
    token = _sanitize_for_path(display_name, preserve_case=keep_case)

    # Placeholder values shared by both directory templates; the slug comes
    # from the file-level `_dataset_slug` helper.
    fields = {
        "dataset": token,
        "slug": _dataset_slug(dataset_name),
        "ts": ts,
    }

    subdir_root = cfg.get("hf_subdir_root", "pentachora-adaptive-encoded").strip("/")
    dataset_template = cfg.get("hf_dataset_dir_template", "{dataset}")
    run_template = cfg.get("hf_run_dir_template", "{ts}_{dataset}")
    dataset_dir = dataset_template.format(**fields)
    run_dir = run_template.format(**fields)

    return {
        "dataset_token": token,
        "path_in_repo": "/".join((subdir_root, dataset_dir, run_dir)).strip("/"),
        "local_root": str(Path("artifacts").joinpath(subdir_root, dataset_dir, run_dir)),
    }
|
| 1164 |
+
|
| 1165 |
+
|
| 1166 |
+
|
| 1167 |
+
def _history_to_frame(history: Dict) -> pd.DataFrame:
    """Build the per-epoch history table, NaN-padding short or missing series."""
    columns = (
        "train_loss", "train_acc", "test_acc",
        "ce", "dual", "rose", "diag", "reg", "lambda",
    )
    series = {name: list(history.get(name, [])) for name in columns}
    n_rows = max((len(values) for values in series.values()), default=0)
    table = {"epoch": list(range(1, n_rows + 1))}
    for name, values in series.items():
        # DataFrame construction requires equal-length columns; a series that
        # is present but shorter than the others is padded rather than fatal.
        table[name] = values + [np.nan] * (n_rows - len(values))
    return pd.DataFrame(table)


def _copy_tensorboard_logs(tb_log_dir, tb_dst: Path) -> None:
    """Copy every entry of ``tb_log_dir`` (files and sub-directories) into ``tb_dst``."""
    if not tb_log_dir:
        return
    src = Path(tb_log_dir)
    if not src.is_dir():
        return
    for entry in src.iterdir():
        target = tb_dst / entry.name
        if entry.is_dir():
            # copy2 cannot copy a directory; nested run folders need copytree.
            shutil.copytree(entry, target, dirs_exist_ok=True)
        else:
            shutil.copy2(entry, target)


def save_and_push_artifacts(
    *,
    encoder: nn.Module,
    constellation: nn.Module,
    diag_head: nn.Module,
    config: Dict,
    class_names: List[str],
    history: Dict,
    best: Dict,
    tb_log_dir: Path,
    dataset_names: List[str],  # pass a single dataset name here
):
    """Write one run's artifacts to disk and upload the folder to the Hub.

    The local folder comes from ``_build_hf_paths`` and receives:
    ``weights/*.safetensors`` (state dicts of the three modules, moved to
    CPU), ``config.json``, ``history.json``, ``history.csv``, a copy of the
    local ``plots/`` directory if it exists, a copy of ``tb_log_dir`` under
    ``tensorboard/`` plus ``tensorboard_events.zip``, ``manifest.json`` and
    ``README.md``.  The folder is then uploaded with ``api.upload_folder``
    (``repo_type="model"``).

    Args:
        encoder, constellation, diag_head: modules whose weights are saved.
        config: run configuration; serialized to JSON and also consulted for
            the ``hf_*`` path / naming options.
        class_names: class labels recorded in the manifest.
        history: per-epoch metric lists; series shorter than the longest one
            are NaN-padded in ``history.csv``.
        best: dict read for ``"acc"`` and ``"epoch"``.
        tb_log_dir: TensorBoard log directory to copy (skipped when falsy or
            not a directory).
        dataset_names: list containing exactly one dataset name.

    Returns:
        ``Path`` of the local artifact folder.

    Raises:
        ValueError: if ``dataset_names`` does not contain exactly one entry.

    Relies on helpers defined elsewhere in this file (``_timestamp``,
    ``_resolve_repo_id``, ``_hf_login_if_needed``, ``_ensure_repo``,
    ``_zip_dir``, ``_param_count``) and on ``save_safetensors``.
    """
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(dataset_names) != 1:
        raise ValueError("Pass a single dataset name to save_and_push_artifacts")
    dataset_name = dataset_names[0]

    ts = _timestamp()
    repo_id = _resolve_repo_id(config)
    _hf_login_if_needed()
    api = _ensure_repo(repo_id)

    paths = _build_hf_paths(dataset_name, config, ts)
    base_out = Path(paths["local_root"])
    base_out.mkdir(parents=True, exist_ok=True)

    # Weight file naming
    ds_token = paths["dataset_token"]
    suffix = f"_{ds_token}" if bool(config.get("hf_weight_suffix_dataset", True)) else ""

    wdir = base_out / "weights"
    wdir.mkdir(parents=True, exist_ok=True)
    modules = (
        ("encoder", encoder),
        ("constellation", constellation),
        ("diagnostic_head", diag_head),
    )
    for stem, module in modules:
        cpu_state = {k: v.cpu() for k, v in module.state_dict().items()}
        save_safetensors(cpu_state, str(wdir / f"{stem}{suffix}.safetensors"))

    # Config + history
    (base_out / "config.json").write_text(json.dumps(config, indent=2, sort_keys=True), encoding="utf-8")
    (base_out / "history.json").write_text(json.dumps(history, indent=2, sort_keys=True), encoding="utf-8")

    # CSV history
    _history_to_frame(history).to_csv(base_out / "history.csv", index=False)

    # Plots
    if Path("plots").exists():
        shutil.copytree("plots", base_out / "plots", dirs_exist_ok=True)

    # TensorBoard
    tb_dst = base_out / "tensorboard"
    tb_dst.mkdir(parents=True, exist_ok=True)
    _copy_tensorboard_logs(tb_log_dir, tb_dst)
    _zip_dir(tb_dst, base_out / "tensorboard_events.zip")

    # Manifest + README
    manifest = {
        "timestamp": ts,
        "repo_id": repo_id,
        "subdirectory": paths["path_in_repo"],
        "dataset_name": dataset_name,
        "class_names": class_names,
        "num_classes": len(class_names),
        "models": {
            "encoder": {"params": _param_count(encoder)},
            "constellation": {"params": _param_count(constellation)},
            "diagnostic_head": {"params": _param_count(diag_head)},
        },
        "results": {
            "best_test_accuracy": float(best.get("acc", 0.0)),
            "best_epoch": int(best.get("epoch", -1)),
        },
        "environment": {
            "python": sys.version,
            "platform": platform.platform(),
            "torch": torch.__version__,
            "cuda_available": torch.cuda.is_available(),
            "cuda_device": (torch.cuda.get_device_name(0) if torch.cuda.is_available() else None),
            "cpu_count": psutil.cpu_count(logical=True),
            "memory_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        },
    }
    (base_out / "manifest.json").write_text(json.dumps(manifest, indent=2, sort_keys=True), encoding="utf-8")

    (base_out / "README.md").write_text(
        f"""# Pentachora Adaptive Encoded — {ts}

**Dataset:** {dataset_name}

**Contents**
- `weights/*.safetensors` — encoder, constellation, diagnostic head
- `config.json`, `manifest.json`
- `history.json` / `history.csv`
- `tensorboard/` (and `tensorboard_events.zip`)
- `plots/` — accuracy, loss, λ, confusion
""",
        encoding="utf-8"
    )

    # Push
    print(f"[push] Uploading to hf://{repo_id}/{paths['path_in_repo']}")
    api.upload_folder(
        repo_id=repo_id,
        folder_path=str(base_out),
        path_in_repo=paths["path_in_repo"],
        repo_type="model",
        commit_message=f"[{dataset_name}] {ts} | best_acc={manifest['results']['best_test_accuracy']:.4f}",
    )
    print("[push] ✅ Upload complete.")
    return base_out
|
| 1286 |
+
|
| 1287 |
+
# ---------------------------------------------------------------------
|
| 1288 |
+
# Dataset sweep
|
| 1289 |
+
# ---------------------------------------------------------------------
|
| 1290 |
+
def run_one_dataset(name: str) -> Dict:
    """Train on one dataset, plot the results, upload artifacts, and summarize.

    Loads the dataset via ``get_dataset_single``, trains with ``train_one``
    using a copy of the module-level ``config`` extended with the dataset's
    ``num_classes`` / ``input_dim`` / ``input_channels``, writes plots under
    ``plots/`` and calls ``save_and_push_artifacts``.

    Returns:
        Dict with keys ``dataset``, ``classes``, ``channels``, ``img_size``,
        ``best_acc``, ``best_epoch``, ``params_encoder``,
        ``params_constellation`` and ``elapsed_min`` (training time only).
    """
    print("\n" + "="*60)
    print(f"RUN: {name}")
    print("="*60)

    # Load
    train_loader, test_loader, ncls, class_names, in_dim, out_c = get_dataset_single(
        name, batch_size=config["batch_size"], num_workers=2
    )
    cfg_local = dict(config)
    cfg_local["num_classes"] = ncls
    cfg_local["input_dim"] = in_dim
    cfg_local["input_channels"] = out_c

    # TB writer per dataset
    ts = _timestamp()
    tb_dir = Path("tb_logs") / f"{_dataset_slug(name)}" / ts
    tb_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(log_dir=str(tb_dir))

    try:
        start = time.time()
        encoder, constellation, diag_head, history, best = train_one(
            train_loader, test_loader, ncls, cfg_local, device, writer, class_names
        )
        elapsed_min = (time.time() - start) / 60.0
    finally:
        # Close the writer even when training fails: this flushes pending
        # events so the log files copied into the artifacts are complete, and
        # releases the file handle before the next dataset in a sweep.
        writer.close()

    # Plots
    # NOTE(review): "plots" is shared by every run in a sweep and is copied
    # wholesale into each run's artifacts, so earlier datasets' best-confusion
    # images may be included in later uploads -- confirm whether intended.
    plot_history(history, outdir="plots")
    if best["cm"] is not None:
        plot_confusion(best["cm"], class_names, outpath=f"plots/best_confusion_{_dataset_slug(name)}_epoch_{best['epoch']}.png")

    # Push artifacts
    save_and_push_artifacts(
        encoder=encoder,
        constellation=constellation,
        diag_head=diag_head,
        config=cfg_local,
        class_names=class_names,
        history=history,
        best=best,
        tb_log_dir=tb_dir,
        dataset_names=[name],
    )

    result = {
        "dataset": name,
        "classes": ncls,
        "channels": out_c,
        "img_size": config.get("img_size", 28),
        "best_acc": float(best["acc"]),
        "best_epoch": int(best["epoch"]),
        "params_encoder": _param_count(encoder),
        "params_constellation": _param_count(constellation),
        "elapsed_min": round(elapsed_min, 3),
    }
    print(f"[done] {name} -> best_acc={result['best_acc']:.4f} @ epoch {result['best_epoch']} time={result['elapsed_min']:.2f}m")
    return result
|
| 1347 |
+
|
| 1348 |
+
def run_sweep(datasets: List[str]) -> Dict:
    """Run ``run_one_dataset`` for each name and upload a sweep summary.

    A dataset that raises is recorded as a failure and the sweep continues
    (deliberate best-effort).  ``results.csv``, ``results.json`` and
    ``failures.json`` are written to ``sweeps/<timestamp>/`` and uploaded to
    ``<hf_subdir_root>/_sweep/<timestamp>`` in the configured repo, where
    ``hf_subdir_root`` is read from the module-level ``config`` with the same
    default (``"pentachora-adaptive-encoded"``) used for per-dataset uploads.

    Returns:
        Dict with ``timestamp``, ``results`` (list of per-dataset summaries),
        ``failures`` (list of dicts with ``dataset``, ``error``,
        ``error_type`` and ``traceback``) and ``path_in_repo``.
    """
    import traceback  # local import: only needed to record failure details

    os.makedirs("sweeps", exist_ok=True)
    results = []
    failures = []
    for name in datasets:
        try:
            results.append(run_one_dataset(name))
        except Exception as e:
            print(f"[fail] {name}: {e}")
            # str(e) alone is often empty or ambiguous (e.g. a bare KeyError),
            # so keep the exception type and traceback for later diagnosis.
            failures.append({
                "dataset": name,
                "error": str(e),
                "error_type": type(e).__name__,
                "traceback": traceback.format_exc(),
            })

    # Save local sweep summary
    ts = _timestamp()
    sweep_dir = Path("sweeps") / ts
    sweep_dir.mkdir(parents=True, exist_ok=True)

    df = pd.DataFrame(results)
    df.to_csv(sweep_dir / "results.csv", index=False)
    (sweep_dir / "results.json").write_text(json.dumps(results, indent=2), encoding="utf-8")
    (sweep_dir / "failures.json").write_text(json.dumps(failures, indent=2), encoding="utf-8")

    # Push sweep summary
    repo_id = _resolve_repo_id(config)
    _hf_login_if_needed()
    api = _ensure_repo(repo_id)
    # Keep the summary under the same configurable root as the per-dataset
    # runs instead of a hard-coded folder name.
    root = config.get("hf_subdir_root", "pentachora-adaptive-encoded").strip("/")
    path_in_repo = f"{root}/_sweep/{ts}".strip("/")
    print(f"[push] Uploading sweep summary to hf://{repo_id}/{path_in_repo}")
    api.upload_folder(repo_id=repo_id, folder_path=str(sweep_dir), path_in_repo=path_in_repo, repo_type="model")
    print("[push] ✅ Sweep summary uploaded.")

    return {"timestamp": ts, "results": results, "failures": failures, "path_in_repo": path_in_repo}
|
| 1379 |
+
|
| 1380 |
+
# ---------------------------------------------------------------------
|
| 1381 |
+
# Main
|
| 1382 |
+
# ---------------------------------------------------------------------
|
| 1383 |
+
def main():
    """Entry point: print the configuration, then run a sweep or a single dataset.

    Sweep mode is selected when ``config["sweep_all"]`` is truthy or the
    ``RUN_SWEEP`` environment variable equals ``"1"``; it runs every name in
    ``TORCHVISION_DATASETS`` (plus ``MEDMNIST_DATASETS`` when ``medmnist`` is
    not ``None``) through ``run_sweep`` and returns.

    Otherwise the dataset named by ``config["dataset"]`` (default
    ``"FashionMNIST"``) is trained, plotted, and uploaded.  In this mode the
    module-level ``config`` is updated in place with ``num_classes``,
    ``input_dim`` and ``input_channels``.
    """
    print("\n" + "="*60)
    print("PENTACHORON CONSTELLATION FINAL CONFIGURATION")
    print("="*60)
    for k, v in config.items():
        print(f"{k:24}: {v}")
    if config["lr"] > 1e-1:
        print(f"⚠️ High LR detected ({config['lr']}). If unstable, try 5e-3 to 5e-2.")

    # Sweep mode?
    if bool(config.get("sweep_all", False)) or os.getenv("RUN_SWEEP", "0") == "1":
        # Try all TorchVision + MedMNIST (skip those not available)
        datasets_all = list(TORCHVISION_DATASETS)
        if medmnist is not None:
            datasets_all += MEDMNIST_DATASETS
        out = run_sweep(datasets_all)
        print(f"\nSweep complete. Summary path: {out['path_in_repo']}")
        return

    # Single dataset default (edit here as desired)
    DATASET = config.get("dataset", "FashionMNIST")

    train_loader, test_loader, ncls, class_names, in_dim, out_c = get_dataset_single(
        DATASET, batch_size=config["batch_size"], num_workers=2
    )
    config["num_classes"] = ncls
    config["input_dim"] = in_dim
    config["input_channels"] = out_c

    tb_dir = Path("tb_logs") / f"{_dataset_slug(DATASET)}" / _timestamp()
    tb_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(log_dir=str(tb_dir))

    try:
        start = time.time()
        encoder, constellation, diag_head, history, best = train_one(
            train_loader, test_loader, ncls, config, device, writer, class_names
        )
        elapsed = (time.time() - start) / 60.0
    finally:
        # Flush and release the event file before its directory is copied
        # into the uploaded artifacts (and also when training fails).
        writer.close()

    # Plots
    plot_history(history, outdir="plots")
    if best["cm"] is not None:
        plot_confusion(best["cm"], class_names, outpath=f"plots/best_confusion_epoch_{best['epoch']}.png")

    print("\n" + "="*60)
    print("TRAINING COMPLETE")
    print("="*60)
    print(f"Best Test Accuracy : {best['acc']*100:.2f}% @ epoch {best['epoch']}")
    print(f"Total Training Time: {elapsed:.2f} minutes")

    save_and_push_artifacts(
        encoder=encoder,
        constellation=constellation,
        diag_head=diag_head,
        config=config,
        class_names=class_names,
        history=history,
        best=best,
        tb_log_dir=tb_dir,
        dataset_names=[DATASET],
    )
    print("[done] Artifacts uploaded and saved locally.")


if __name__ == "__main__":
    main()
|