ML, Breast Cancer Data
None
# Install the required packages (comment these lines out once they are installed)
!pip install opendatasets #download the data set
!pip install pandas #data manipulation and analysis
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: opendatasets in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (0.1.22)
Requirement already satisfied: tqdm in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from opendatasets) (4.66.2)
Requirement already satisfied: kaggle in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from opendatasets) (1.6.6)
Requirement already satisfied: click in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from opendatasets) (8.1.7)
Requirement already satisfied: six>=1.10 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from kaggle->opendatasets) (1.15.0)
Requirement already satisfied: certifi in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (2023.7.22)
Requirement already satisfied: python-dateutil in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (2.8.2)
Requirement already satisfied: requests in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (2.31.0)
Requirement already satisfied: python-slugify in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (8.0.4)
Requirement already satisfied: urllib3 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (2.0.7)
Requirement already satisfied: bleach in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (6.1.0)
Requirement already satisfied: webencodings in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from bleach->kaggle->opendatasets) (0.5.1)
Requirement already satisfied: text-unidecode>=1.3 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from python-slugify->kaggle->opendatasets) (1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from requests->kaggle->opendatasets) (3.3.1)
Requirement already satisfied: idna<4,>=2.5 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from requests->kaggle->opendatasets) (3.4)
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: pandas in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (2.2.1)
Requirement already satisfied: numpy<2,>=1.22.4 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from pandas) (2023.4)
Requirement already satisfied: tzdata>=2022.7 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from pandas) (2024.1)
Requirement already satisfied: six>=1.5 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.15.0)
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
import pandas as pd
# Load the breast cancer dataset from the CSV file in the working directory.
# The file is read as comma-separated text in the Windows-1252 encoding.
df_cancer = pd.read_csv(
    'breast_cancer.csv',
    sep=',',
    encoding='cp1252',
)
# Keep the DataFrame as the last expression so the notebook renders it.
df_cancer
Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
1 | 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 | 2 |
2 | 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 | 2 |
3 | 6 | 8 | 8 | 1 | 3 | 4 | 3 | 7 | 1 | 2 |
4 | 4 | 1 | 1 | 3 | 2 | 1 | 3 | 1 | 1 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
678 | 3 | 1 | 1 | 1 | 3 | 2 | 1 | 1 | 1 | 2 |
679 | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 2 |
680 | 5 | 10 | 10 | 3 | 7 | 3 | 8 | 10 | 2 | 4 |
681 | 4 | 8 | 6 | 4 | 3 | 4 | 10 | 6 | 1 | 4 |
682 | 4 | 8 | 8 | 5 | 4 | 5 | 10 | 4 | 1 | 4 |
683 rows × 10 columns
# Columns kept as model features; every other column of df_cancer
# (including the 'Class' label) is left out of df_final.
selected_features = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Normal Nucleoli',
    'Mitoses',
]
df_final = df_cancer[selected_features]
# Show the reduced DataFrame as the cell output.
df_final
Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Single Epithelial Cell Size | Bare Nuclei | Normal Nucleoli | Mitoses | |
---|---|---|---|---|---|---|---|
0 | 5 | 1 | 1 | 2 | 1 | 1 | 1 |
1 | 5 | 4 | 4 | 7 | 10 | 2 | 1 |
2 | 3 | 1 | 1 | 2 | 2 | 1 | 1 |
3 | 6 | 8 | 8 | 3 | 4 | 7 | 1 |
4 | 4 | 1 | 1 | 2 | 1 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... |
678 | 3 | 1 | 1 | 3 | 2 | 1 | 1 |
679 | 2 | 1 | 1 | 2 | 1 | 1 | 1 |
680 | 5 | 10 | 10 | 7 | 3 | 10 | 2 |
681 | 4 | 8 | 6 | 3 | 4 | 6 | 1 |
682 | 4 | 8 | 8 | 4 | 5 | 4 | 1 |
683 rows × 7 columns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# df_final (the selected feature columns) is the feature matrix used for prediction
x = df_final
# Use the 'Class' column as the target variable and convert it to binary
def replace_score(score, *, malignant_label=4):
    """Convert a raw class label into a binary target value.

    Args:
        score: Raw label taken from the dataset's 'Class' column.
        malignant_label: Raw label that counts as the positive class.
            Defaults to 4, the label this notebook encodes as 1 (malignant).

    Returns:
        1 if ``score`` equals ``malignant_label``, otherwise 0.
    """
    return 1 if score == malignant_label else 0
# Binary target: every 'Class' label is passed through replace_score
y = df_cancer['Class'].map(replace_score)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier on the training rows
# (raise max_iter if the solver does not converge)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)
Accuracy: 0.9562043795620438
Predicting Survival
So, now we are ready to play the game… “Would I have survived the Titanic?”.
Insert your own data in the code. Look at your analysis and consider how you would travel today.
- Data description:
- pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
- name - Name
- sex - male or female
- age - age in years
- sibsp - number of Siblings/Spouses Aboard
- parch - number of Parents/Children Aboard
- fare - passenger fare 0 to 512
- embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
- alone - boolean True or False
import numpy as np
import pandas as pd
# Describe one new patient with the same feature columns, in the same order,
# as the df_final frame the model was trained on
new_patient = pd.DataFrame({
    'Clump Thickness': [2],
    'Uniformity of Cell Size': [8],
    'Uniformity of Cell Shape': [1],
    'Single Epithelial Cell Size': [8],
    'Bare Nuclei': [6],
    'Normal Nucleoli': [1],
    'Mitoses': [1],
})

# The trained logistic regression is bound to `model`; the name `logreg`
# was never defined in this notebook and raised a NameError here.
# predict_proba returns one row per patient, with one column per class in
# the order of the encoded target: benign (0) first, then malignant (1).
probability = model.predict_proba(new_patient)

# Only one patient was passed in, so take the first (and only) row
benign_proba, malignant_proba = probability[0]

# Print the predicted probabilities for the tumor being benign and malignant
print(f'Benign probability: {benign_proba:.2%}')
print(f'Malignant probability: {malignant_proba:.2%}')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb Cell 7 line 1
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=4'>5</a> new_patient = pd.DataFrame({
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=5'>6</a> 'Clump Thickness': [2],
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=6'>7</a> 'Uniformity of Cell Size': [8],
(...)
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=11'>12</a> 'Mitoses': [1]
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=12'>13</a> })
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=14'>15</a> # Use the trained logistic regression model (logreg) to predict the probability
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=15'>16</a> # of the tumor being benign (0) or malignant (1)
---> <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=16'>17</a> probability = logreg.predict_proba(new_patient)
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=18'>19</a> # Extract the probabilities
<a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=19'>20</a> benign_proba, malignant_proba = probability[0]
NameError: name 'logreg' is not defined
from joblib import dump, load
# Persist the fitted model to the working directory with joblib
dump(value=model, filename='cancer_model.joblib')
['cancer_model.joblib']
# Read the saved model back from the working directory
model = load(filename='cancer_model.joblib')
import pandas as pd
import numpy as np
from flask_restful import Api, Resource
from flask import Blueprint, request
from joblib import load
from pathlib import Path

# Load the ML model: replace the file name with whatever yours is.
# './api/...' is resolved against the current working directory, so it is only
# found when the process is started from the directory that contains api/.
# The training code dumps the model as 'cancer_model.joblib' in the working
# directory, so that location is tried as a fallback.
_MODEL_CANDIDATES = (
    Path('./api/cancer_model.joblib'),
    Path('cancer_model.joblib'),
)
# When neither file exists, keep the first candidate so load() still raises
# FileNotFoundError for the api/ location, as before.
_model_path = next(
    (path for path in _MODEL_CANDIDATES if path.is_file()),
    _MODEL_CANDIDATES[0],
)
# NOTE: joblib files are pickle-based; only load model files you trust.
model = load(_model_path)

# Initialize Flask API endpoint blueprint; its routes live under /api/cancer
cancer_api = Blueprint('cancer_api', __name__, url_prefix='/api/cancer')
api = Api(cancer_api)
#Use a post request to take data from the frontend as a JSON, then return output value
class cancerAPI:
    """Namespace that groups the REST resources of the cancer prediction API."""

    class _Predict(Resource):
        """Resource that scores one patient record sent by the frontend."""

        def post(self):
            """Predict the class for the JSON object in the request body.

            Returns:
                ``({'score': <prediction>}, 200)`` on success, or
                ``({'message': <reason>}, 400)`` when the body is missing, is
                not a non-empty JSON object, lacks a feature the model was
                trained on, or holds values the model cannot score.
            """
            # silent=True makes get_json return None for a missing or
            # malformed JSON body instead of raising, so the 400 response
            # below is actually reached
            body = request.get_json(silent=True)
            if not isinstance(body, dict) or not body:
                return {'message': 'No data provided'}, 400

            # Convert frontend JSON output to a one-row pandas dataframe
            data = pd.DataFrame([body])

            # feature_names_in_ is only present when the model was fitted on
            # a DataFrame; when it is, use it to reject incomplete requests
            # and to put the columns in the order used for training
            expected = getattr(model, 'feature_names_in_', None)
            if expected is not None:
                missing = [name for name in expected if name not in data.columns]
                if missing:
                    return {'message': 'Missing features: ' + ', '.join(missing)}, 400
                data = data[list(expected)]

            try:
                # model.predict returns a 1-element array; .item() turns the
                # numpy scalar into a plain Python value that the JSON
                # encoder can serialize
                score = model.predict(data)[0].item()
            except (TypeError, ValueError):
                return {'message': 'Invalid feature data'}, 400
            return {'score': score}, 200

    # Add endpoint resource for this method
    api.add_resource(_Predict, '/predict')
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb Cell 10 line 9
<a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=5'>6</a> from joblib import load
<a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=7'>8</a> #Load the ML model: replace the file name with whatever yours is
----> <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=8'>9</a> model = load('./api/cancer_model.joblib')
<a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=10'>11</a> #Initialize Flask API endpoint blueprint
<a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=11'>12</a> cancer_api = Blueprint('cancer_api', __name__, url_prefix='/api/cancer')
File ~/Library/Python/3.9/lib/python/site-packages/joblib/numpy_pickle.py:650, in load(filename, mmap_mode)
648 obj = _unpickle(fobj)
649 else:
--> 650 with open(filename, 'rb') as f:
651 with _read_fileobject(f, filename, mmap_mode) as fobj:
652 if isinstance(fobj, str):
653 # if the returned file object is a string, this means we
654 # try to load a pickle file generated with an version of
655 # Joblib so we load it with joblib compatibility function.
FileNotFoundError: [Errno 2] No such file or directory: './api/cancer_model.joblib'
from __init__ import app
from api.cancerr import cancer_api
from flask import Flask
from flask_cors import CORS
# `request` is used by the hook below but is not among the names imported above
from flask import request

# Enable CORS for everything.
# NOTE(review): this rebinds `app`, replacing the object imported from
# __init__ above - confirm which application instance is meant to be served.
app = Flask(__name__)
# Keep a reference to the CORS extension: the hook below assigns to it, and
# the name `cors` was previously never bound.
cors = CORS(app)
app.register_blueprint(cancer_api)


# Allow all CORS headers before requests
@app.before_request
def before_request():
    """Mark every origin as allowed when the request carries an Origin header."""
    allowed_origin = request.headers.get('Origin')
    if allowed_origin:
        # NOTE(review): CORS(app) with default options presumably already
        # allows every origin - confirm whether this assignment is still needed.
        cors._origins = "*"
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb Cell 11 line 1
----> <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X20sZmlsZQ%3D%3D?line=0'>1</a> from __init__ import app
<a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X20sZmlsZQ%3D%3D?line=1'>2</a> from api.happy import happiness_api
<a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X20sZmlsZQ%3D%3D?line=2'>3</a> from flask import Flask
ModuleNotFoundError: No module named '__init__'