Done

fad95a5c · caleb.biggs · fad95a5c · fad95a5c
Commit fad95a5c authored 1 year ago by caleb.biggs
--- a/Notebook.ipynb
+++ b/Notebook.ipynb
--- a/main.py
+++ b/main.py
+import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.cluster import KMeans
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import silhouette_score
+
+
+#Categorizes Pokemon of a given type into the given number of clusters
+def categorize(dataByType, type1, numClusters):
+    steps = [
+        ('scale', MinMaxScaler()),
+        ('cluster', KMeans(n_clusters=numClusters, n_init=10))
+    ]
+    pipe = Pipeline(steps)
+    
+    toDrop = ['Name', 'Type 1', 'Type 2']
+    typeData = dataByType[type1].drop(columns=toDrop)
+    pipe.fit(typeData)
+    predictions = pipe.predict(typeData)
+    
+    return (numClusters, silhouette_score(typeData, predictions), predictions)
+
+
+pd.set_option('display.width', None)
+pd.set_option('display.max_rows', None)
+
+#Read in data, extract a list of types, and partition the data by type
+data = pd.read_csv("Pokemon.csv")
+types = data["Type 1"].unique()
+dataByType = {}
+for pokemon in data.iterrows():
+    if pokemon[1]["Type 1"] not in dataByType:
+        dataByType[pokemon[1]["Type 1"]] = pd.DataFrame([pokemon[1]])
+        continue
+    dataByType[pokemon[1]["Type 1"]] = pd.concat([
+        dataByType[pokemon[1]["Type 1"]], 
+        pd.DataFrame([pokemon[1]])
+    ])
+
+
+#Get the clustering data for each type and print it
+typePredictions = {}
+for type1 in types:
+    bestNum = 0
+    bestScore = 0
+    print(f"{type1}\n-----------")
+    for i in range (2, 15):
+        if i >= len(dataByType[type1]): break
+        output = categorize(dataByType, type1, i)
+        print(f"{output[0]} clusters: {output[1]}")
+        if output[1] > bestScore:
+            bestNum = output[0]
+            bestScore = output[1]
+            typePredictions[type1] = (output[0], output[2])
+    print(f"best number of clusters: {bestNum}\nbest score: {bestScore}\n")
+
+
+#Create dataframes from the best categories from the previous step
+typeClusters = {}
+for type1 in types:
+    typeClusters[type1] = {}
+    for i in range(len(typePredictions[type1][1])):
+        if typePredictions[type1][1][i] not in typeClusters[type1]:
+            #Gotta love Python
+            typeClusters[type1][typePredictions[type1][1][i]] = pd.DataFrame([dataByType[type1].iloc[i]])
+            continue
+        typeClusters[type1][typePredictions[type1][1][i]] = pd.concat([
+            typeClusters[type1][typePredictions[type1][1][i]],
+            pd.DataFrame([dataByType[type1].iloc[i]])
+        ])
+
+
+#Print full details of the best clusters
+for type1 in types:
+    print(f"\n{type1}\n-----")
+    for i in range(len(typeClusters[type1])):
+        print(f"Cluster {i}\n{typeClusters[type1][i]}")
+        print(f"Mean HP: {typeClusters[type1][i].loc[:, 'HP'].mean()}\
+            \nMean Attack: {typeClusters[type1][i].loc[:, 'Attack'].mean()}\
+            \nMean Defense: {typeClusters[type1][i].loc[:, 'Defense'].mean()}\
+            \nMean Sp. Atk: {typeClusters[type1][i].loc[:, 'Sp. Atk'].mean()}\
+            \nMean Sp. Def: {typeClusters[type1][i].loc[:, 'Sp. Def'].mean()}\
+            \nMean Speed: {typeClusters[type1][i].loc[:, 'Speed'].mean()}")
+        print()
\ No newline at end of file