#import modules 

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the per-file CSVs and reorganize them into one labelled frame.

# folder holding the CSV files (one file per neuron_id, see below)
data_dir = Path("Freiwald_Tsao_faceviews_AM_data_csv")

# Collect the CSV files in sorted order: Path.glob yields entries in an
# arbitrary, filesystem-dependent order, so sorting keeps runs reproducible.
csv_files = sorted(data_dir.glob("*.csv"))
print(f"Found {len(csv_files)} files")

# Fail early with a clear message; pd.concat would otherwise raise a generic
# "No objects to concatenate" error on an empty list.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_dir.resolve()}")

# read each file and tag its rows with the file stem as the neuron ID
dfs = [
    pd.read_csv(csv_path).assign(neuron_id=csv_path.stem)
    for csv_path in csv_files
]

# combine into one dataframe
combined = pd.concat(dfs, ignore_index=True)

df = combined

# Time-bin columns are named "time.<start>_<end>"; match that full prefix
# rather than any column that merely starts with "t".
time_cols = [c for c in df.columns if c.startswith('time.')]

# Keep the first 400 time-bin columns.
# NOTE(review): this equals "the first 400 ms" only if the bins are 1 ms wide
# and the columns are in chronological order -- confirm against the CSVs.
time_cols_400 = time_cols[:400]

# label / metadata columns to carry along
label_cols_raw = [
    'labels.person',             # identity (target)
    'labels.orientation',        # view (front, profile, etc.)
    'labels.stimID',             # stimulus ID
    'labels.orient_person_combo',
    'site_info.monkey',
    'site_info.region',
    'neuron_id',
]

# keep only the label columns that actually exist in the data
label_cols = [c for c in label_cols_raw if c in df.columns]

df_400 = df[label_cols + time_cols_400]

Found 193 files

# Preview the first rows of df_400 (label columns + selected time-bin columns).
# As a bare expression this only renders when it is the last statement of a
# notebook cell; it has no visible effect when run as a plain script.
df_400.head()

# preprocessing steps
# collapse the selected time-bin columns into a single spike count per row
df_400 = df_400.copy()  # work on a real copy, not a view of df
df_400['spike_count_0_400'] = df_400[time_cols_400].sum(axis=1)

# keep only ID variables + spike count
mini = df_400[['labels.stimID', 'neuron_id', 'labels.person']].copy()
mini['spike_count'] = df_400['spike_count_0_400']

# pivot: 1 row per stimulus ID, 1 column per neuron.
# Repeated rows for the same (stimID, neuron) pair are averaged, so each cell
# is a mean spike count; pairs with no data are filled with 0.
trial_neuron = mini.pivot_table(
    index='labels.stimID',
    columns='neuron_id',
    values='spike_count',
    aggfunc='mean',   # pivot_table's default, stated explicitly
    fill_value=0,
)

# Target label (identity) per stimulus ID, explicitly reordered to match the
# rows of the feature matrix instead of relying on both results being sorted
# the same way.
y = (
    mini.groupby('labels.stimID')['labels.person']
    .first()
    .reindex(trial_neuron.index)
)

X = trial_neuron.values
y = y.values

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (200, 193)
y shape: (200,)

# hold out 20% of the rows as a test set, stratified on the target labels
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(X_train.shape, X_test.shape)

(160, 193) (40, 193)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error

# model: standardize the features, then fit a logistic-regression classifier
scaler_step = ('scaler', StandardScaler())
clf_step = ('clf', LogisticRegression(max_iter=500, n_jobs=-1))
logit_pipe = Pipeline(steps=[scaler_step, clf_step])

# candidate values for C; the "clf__" prefix routes the parameter to the
# pipeline step named 'clf'
param_grid = dict(clf__C=[0.01, 0.1, 1.0, 10.0])

# shuffled, stratified 5-fold splitter (seeded for reproducibility)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# exhaustive search over param_grid, scored by accuracy
logit_grid = GridSearchCV(
    estimator=logit_pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# run the grid search on the training split only
logit_grid.fit(X_train, y_train)

print("Best C:", logit_grid.best_params_['clf__C'])
print("CV accuracy (logit):", logit_grid.best_score_)

# best estimator (refit on the full training split by GridSearchCV)
logit_best = logit_grid.best_estimator_

# predict on the held-out test set
y_test_pred_logit = logit_best.predict(X_test)

# Test accuracy + misclassification rate.
# The accuracy is computed once and reused; the misclassification rate is
# derived from logit_test_acc (the original code referenced an undefined
# name, `test_acc`, which raises NameError on a fresh run).
logit_test_acc = accuracy_score(y_test, y_test_pred_logit)
logit_test_misclass = 1 - logit_test_acc

print("Test accuracy (logit):", logit_test_acc)
print(classification_report(y_test, y_test_pred_logit, zero_division=0))

print(f"Test accuracy (logit): {logit_test_acc:.3f}")
print(f"Test misclassification rate (logit): {logit_test_misclass:.3f}")

Best C: 1.0
CV accuracy (logit): 0.8125
Test accuracy (logit): 0.775
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       1.00      0.50      0.67         2
           3       1.00      1.00      1.00         1
           4       0.50      0.50      0.50         2
           5       0.50      0.50      0.50         2
           6       1.00      1.00      1.00         1
           7       1.00      0.50      0.67         2
           8       0.33      0.50      0.40         2
           9       0.50      0.50      0.50         2
          10       1.00      1.00      1.00         2
          11       1.00      1.00      1.00         2
          12       0.33      1.00      0.50         1
          13       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         2
          15       0.00      0.00      0.00         1
          16       0.67      1.00      0.80         2
          17       0.00      0.00      0.00         1
          18       1.00      1.00      1.00         1
          19       1.00      1.00      1.00         2
          20       1.00      1.00      1.00         1
          21       1.00      0.50      0.67         2
          22       1.00      1.00      1.00         2
          23       1.00      1.00      1.00         1
          24       1.00      1.00      1.00         1
          25       1.00      1.00      1.00         1

    accuracy                           0.78        40
   macro avg       0.79      0.78      0.77        40
weighted avg       0.81      0.78      0.77        40

Test accuracy (logit): 0.775
Test misclassification rate (logit): 0.225

# confusion matrix for the logistic-regression test predictions
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

fig, ax = plt.subplots(figsize=(6, 6))
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_test_pred_logit,
    ax=ax,
)
ax.set_title("Logistic Regression – Test Confusion Matrix")
fig.tight_layout()
plt.show()

# Random Forest, tuned by grid search with the shared `cv` splitter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

rf = RandomForestClassifier(n_jobs=-1, random_state=0)

# hyper-parameter candidates: forest size and maximum tree depth
rf_param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10, 20],
)

# exhaustive search over rf_param_grid, scored by accuracy
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# run the grid search on the training split only
rf_grid.fit(X_train, y_train)

print("Best RF params:", rf_grid.best_params_)
print("CV accuracy (RF):", rf_grid.best_score_)

# winning forest, then its predictions on the held-out test set
rf_best = rf_grid.best_estimator_
y_test_pred_rf = rf_best.predict(X_test)

# test accuracy and its complement (misclassification rate)
rf_test_acc = accuracy_score(y_test, y_test_pred_rf)
rf_test_misclass = 1 - rf_test_acc

print("Test accuracy (RF):", rf_test_acc)
print(classification_report(y_test, y_test_pred_rf, zero_division=0))

print(f"Test accuracy (RF): {rf_test_acc:.3f}")
print(f"Test misclassification rate (RF): {rf_test_misclass:.3f}")

Best RF params: {'max_depth': 20, 'n_estimators': 100}
CV accuracy (RF): 0.75625
Test accuracy (RF): 0.7
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       1.00      0.50      0.67         2
           3       0.50      1.00      0.67         1
           4       0.50      0.50      0.50         2
           5       0.33      0.50      0.40         2
           6       1.00      1.00      1.00         1
           7       1.00      0.50      0.67         2
           8       0.00      0.00      0.00         2
           9       0.50      0.50      0.50         2
          10       1.00      1.00      1.00         2
          11       1.00      1.00      1.00         2
          12       0.50      1.00      0.67         1
          13       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         2
          15       0.00      0.00      0.00         1
          16       1.00      1.00      1.00         2
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          19       1.00      1.00      1.00         2
          20       0.00      0.00      0.00         1
          21       1.00      0.50      0.67         2
          22       1.00      1.00      1.00         2
          23       1.00      1.00      1.00         1
          24       0.33      1.00      0.50         1
          25       1.00      1.00      1.00         1

    accuracy                           0.70        40
   macro avg       0.67      0.68      0.65        40
weighted avg       0.72      0.70      0.69        40

Test accuracy (RF): 0.700
Test misclassification rate (RF): 0.300

from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# confusion matrix for the random-forest test predictions
fig, ax = plt.subplots(figsize=(6, 6))
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_test_pred_rf,
    ax=ax,
)
ax.set_title("Random Forest – Test Confusion Matrix")
fig.tight_layout()
plt.show()

import pandas as pd
# best mean cross-validation accuracy found by each grid search
logit_cv_acc = logit_grid.best_score_
rf_cv_acc = rf_grid.best_score_

# one row per model: name, CV accuracy, held-out test accuracy
results_table = pd.DataFrame(
    [
        ("Logistic Regression", logit_cv_acc, logit_test_acc),
        ("Random Forest", rf_cv_acc, rf_test_acc),
    ],
    columns=["Model", "CV Accuracy", "Test Accuracy"],
)

results_table

# per-class precision/recall/F1 on the test set: logistic regression first,
# then random forest
for test_preds in (y_test_pred_logit, y_test_pred_rf):
    print(classification_report(y_test, test_preds, zero_division=0))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       1.00      0.50      0.67         2
           3       1.00      1.00      1.00         1
           4       0.50      0.50      0.50         2
           5       0.50      0.50      0.50         2
           6       1.00      1.00      1.00         1
           7       1.00      0.50      0.67         2
           8       0.33      0.50      0.40         2
           9       0.50      0.50      0.50         2
          10       1.00      1.00      1.00         2
          11       1.00      1.00      1.00         2
          12       0.33      1.00      0.50         1
          13       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         2
          15       0.00      0.00      0.00         1
          16       0.67      1.00      0.80         2
          17       0.00      0.00      0.00         1
          18       1.00      1.00      1.00         1
          19       1.00      1.00      1.00         2
          20       1.00      1.00      1.00         1
          21       1.00      0.50      0.67         2
          22       1.00      1.00      1.00         2
          23       1.00      1.00      1.00         1
          24       1.00      1.00      1.00         1
          25       1.00      1.00      1.00         1

    accuracy                           0.78        40
   macro avg       0.79      0.78      0.77        40
weighted avg       0.81      0.78      0.77        40

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       1.00      0.50      0.67         2
           3       0.50      1.00      0.67         1
           4       0.50      0.50      0.50         2
           5       0.33      0.50      0.40         2
           6       1.00      1.00      1.00         1
           7       1.00      0.50      0.67         2
           8       0.00      0.00      0.00         2
           9       0.50      0.50      0.50         2
          10       1.00      1.00      1.00         2
          11       1.00      1.00      1.00         2
          12       0.50      1.00      0.67         1
          13       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         2
          15       0.00      0.00      0.00         1
          16       1.00      1.00      1.00         2
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          19       1.00      1.00      1.00         2
          20       0.00      0.00      0.00         1
          21       1.00      0.50      0.67         2
          22       1.00      1.00      1.00         2
          23       1.00      1.00      1.00         1
          24       0.33      1.00      0.50         1
          25       1.00      1.00      1.00         1

    accuracy                           0.70        40
   macro avg       0.67      0.68      0.65        40
weighted avg       0.72      0.70      0.69        40

# side-by-side test confusion matrices: logistic regression (left) and
# random forest (right), both drawn without a colorbar
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (axes[0], y_test_pred_logit, "Logistic Regression – Test Confusion Matrix"),
    (axes[1], y_test_pred_rf, "Random Forest – Test Confusion Matrix"),
]

for panel_ax, test_preds, panel_title in panels:
    ConfusionMatrixDisplay.from_predictions(
        y_test, test_preds,
        ax=panel_ax,
        colorbar=False,
    )
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

from sklearn.decomposition import PCA

# project the full feature matrix onto its first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# scatter of the 2-D embedding, coloured by the target label
pca_fig, pca_ax = plt.subplots(figsize=(8, 6))
scatter = pca_ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab20', s=50)
pca_ax.set_title("Neural Response PCA Embedding")
pca_ax.set_xlabel("PC1")
pca_ax.set_ylabel("PC2")
pca_fig.colorbar(scatter, label="Identity")
plt.show()

# grouped bar chart comparing CV and test accuracy for both models
models = ["Logistic", "Random Forest"]
cv_scores = [logit_cv_acc, rf_cv_acc]
test_scores = [logit_test_acc, rf_test_acc]

x = np.arange(len(models))  # one group position per model
width = 0.35                # width of each bar within a group

fig, ax = plt.subplots(figsize=(6, 4))

# each series is shifted half a bar-width to one side of the group centre
bar_series = (
    (-width/2, cv_scores, "CV accuracy"),
    (+width/2, test_scores, "Test accuracy"),
)
for shift, scores, series_label in bar_series:
    ax.bar(x + shift, scores, width, label=series_label)

ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylim(0, 1)
ax.set_ylabel("Accuracy")
ax.set_title("Model Comparison – CV vs Test Accuracy")
ax.legend()
fig.tight_layout()
plt.show()

	labels.person	labels.orientation	labels.stimID	labels.orient_person_combo	site_info.monkey	site_info.region	neuron_id	time.2_3	...	time.400_401
0	1	front	1	front 1	bert	am	raster_data_bert_am_site013	0	...	0
1	1	front	1	front 1	bert	am	raster_data_bert_am_site013	0	...	1
2	1	front	1	front 1	bert	am	raster_data_bert_am_site013	0	...	0
3	2	front	2	front 2	bert	am	raster_data_bert_am_site013	0	...	0
4	2	front	2	front 2	bert	am	raster_data_bert_am_site013	1	...	0

CMSE 381 Final Project¶

Neural Decoding¶

Group members: Mia Dagati, Lindsey Myers¶

Section_002¶

November 26th, 2025¶

Neural Decoding¶

Background and Motivation¶

Research Questions¶

The primary research question for this project was:¶

Methodology¶

Data¶

Other methods used¶

Models for classification¶

How do we evaluate each model?¶

Logistic Regression:¶

Random Forest:¶

Results¶

Classification results¶

What did we do?¶

What did we find?¶

How do we interpret this?¶

Discussion and Conclusion¶

Discussion on the classification results¶

Conclusion and future steps¶

Author contribution¶

References¶