ML, Breast Cancer Data

Predicting Survival

# Install the required packages (comment these lines out once they are installed)
!pip install opendatasets #download the data set
!pip install pandas #data manipulation and analysis

Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: opendatasets in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (0.1.22)
Requirement already satisfied: tqdm in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from opendatasets) (4.66.2)
Requirement already satisfied: kaggle in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from opendatasets) (1.6.6)
Requirement already satisfied: click in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from opendatasets) (8.1.7)
Requirement already satisfied: six>=1.10 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from kaggle->opendatasets) (1.15.0)
Requirement already satisfied: certifi in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (2023.7.22)
Requirement already satisfied: python-dateutil in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (2.8.2)
Requirement already satisfied: requests in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (2.31.0)
Requirement already satisfied: python-slugify in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (8.0.4)
Requirement already satisfied: urllib3 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (2.0.7)
Requirement already satisfied: bleach in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from kaggle->opendatasets) (6.1.0)
Requirement already satisfied: webencodings in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from bleach->kaggle->opendatasets) (0.5.1)
Requirement already satisfied: text-unidecode>=1.3 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from python-slugify->kaggle->opendatasets) (1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from requests->kaggle->opendatasets) (3.3.1)
Requirement already satisfied: idna<4,>=2.5 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from requests->kaggle->opendatasets) (3.4)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: pandas in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (2.2.1)
Requirement already satisfied: numpy<2,>=1.22.4 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from pandas) (2023.4)
Requirement already satisfied: tzdata>=2022.7 in /Users/rayanesouissi/Library/Python/3.9/lib/python/site-packages (from pandas) (2024.1)
Requirement already satisfied: six>=1.5 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.15.0)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m

import pandas as pd

# Read the breast-cancer CSV from the working directory into a DataFrame.
# The file is decoded as Windows-1252 and split on commas.
df_cancer = pd.read_csv(
    'breast_cancer.csv',
    sep=',',
    encoding='cp1252',
)

# Bare expression so the notebook renders the loaded table
df_cancer

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	Class
0	5	1	1	1	2	1	3	1	1	2
1	5	4	4	5	7	10	3	2	1	2
2	3	1	1	1	2	2	3	1	1	2
3	6	8	8	1	3	4	3	7	1	2
4	4	1	1	3	2	1	3	1	1	2
...	...	...	...	...	...	...	...	...	...	...
678	3	1	1	1	3	2	1	1	1	2
679	2	1	1	1	2	1	1	1	1	2
680	5	10	10	3	7	3	8	10	2	4
681	4	8	6	4	3	4	10	6	1	4
682	4	8	8	5	4	5	10	4	1	4

683 rows × 10 columns

# Keep the seven columns used as model features; this leaves out
# 'Marginal Adhesion', 'Bland Chromatin' and the 'Class' label column.
feature_columns = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Normal Nucleoli',
    'Mitoses',
]
df_final = df_cancer[feature_columns]

# Bare expression so the notebook renders the reduced table
df_final

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Single Epithelial Cell Size	Bare Nuclei	Normal Nucleoli	Mitoses
0	5	1	1	2	1	1	1
1	5	4	4	7	10	2	1
2	3	1	1	2	2	1	1
3	6	8	8	3	4	7	1
4	4	1	1	2	1	1	1
...	...	...	...	...	...	...	...
678	3	1	1	3	2	1	1
679	2	1	1	2	1	1	1
680	5	10	10	7	3	10	2
681	4	8	6	3	4	6	1
682	4	8	8	4	5	4	1

683 rows × 7 columns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Feature matrix: df_final is the DataFrame holding the feature columns used for prediction
x = df_final

# Use the 'Class' column as the target variable and convert it to binary
def replace_score(score):
    """Map a raw 'Class' value to a binary label: 1 when it equals 4, otherwise 0."""
    if score == 4:
        return 1
    return 0

# Binary target derived from 'Class': replace_score maps 4 -> 1 and anything else -> 0
y = df_cancer['Class'].apply(replace_score)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier, allowing the solver up to 1000 iterations
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the held-out rows and print the fraction classified correctly
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9562043795620438

Predicting Survival

So, now we are ready to use the trained model to answer the question… “Is this tumor likely to be benign or malignant?”.

Insert your own feature values in the code below. Compare the predicted probabilities with your analysis above.

Data description (each feature is an integer score; values from 1 to 10 appear in the rows shown above):
- Clump Thickness
- Uniformity of Cell Size
- Uniformity of Cell Shape
- Single Epithelial Cell Size
- Bare Nuclei
- Normal Nucleoli
- Mitoses
- Class - target label in the dataset (2 or 4 in the rows shown above); the training code maps 4 to 1 (malignant) and any other value to 0 (benign)

import numpy as np
import pandas as pd

# Define a new patient's data as a one-row DataFrame whose columns are the
# same seven features (same names, same order) the model was trained on
new_patient = pd.DataFrame({
    'Clump Thickness': [2],
    'Uniformity of Cell Size': [8],
    'Uniformity of Cell Shape': [1],
    'Single Epithelial Cell Size': [8],
    'Bare Nuclei': [6],
    'Normal Nucleoli': [1],
    'Mitoses': [1]
})

# Use the trained logistic regression model (`model`) to predict the probability
# of the tumor being benign (0) or malignant (1).
# The estimator fitted earlier is bound to `model`; the previous name `logreg`
# was never defined and raised a NameError.
probability = model.predict_proba(new_patient)

# predict_proba returns one row per sample with columns ordered like
# model.classes_; with labels 0 and 1 that is (benign, malignant).
benign_proba, malignant_proba = probability[0]

# Print the predicted probabilities for the tumor being benign and malignant
print(f'Benign probability: {benign_proba:.2%}')
print(f'Malignant probability: {malignant_proba:.2%}')

---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb Cell 7 line 1
      <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=4'>5</a> new_patient = pd.DataFrame({
      <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=5'>6</a>     'Clump Thickness': [2],
      <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=6'>7</a>     'Uniformity of Cell Size': [8],
   (...)
     <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=11'>12</a>     'Mitoses': [1]
     <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=12'>13</a> })
     <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=14'>15</a> # Use the trained logistic regression model (logreg) to predict the probability
     <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=15'>16</a> # of the tumor being benign (0) or malignant (1)
---> <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=16'>17</a> probability = logreg.predict_proba(new_patient)
     <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=18'>19</a> # Extract the probabilities
     <a href='vscode-notebook-cell:/Users/rayanesouissi/student02/_notebooks/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#W6sZmlsZQ%3D%3D?line=19'>20</a> benign_proba, malignant_proba = probability[0]


NameError: name 'logreg' is not defined

from joblib import dump, load

# Persist the trained model with joblib; the relative path means the file
# 'cancer_model.joblib' is written to the current working directory
dump(model, 'cancer_model.joblib') 

['cancer_model.joblib']

# Reload the saved model from 'cancer_model.joblib', rebinding `model` to the deserialized copy.
# joblib.load unpickles the file, so only load model files from a trusted source.
model = load('cancer_model.joblib')

import pandas as pd
import numpy as np

from flask_restful import Api, Resource
from flask import Blueprint, request
from joblib import load

#Load the ML model: replace the file name with whatever yours is
# NOTE: the path is relative to the process's working directory, so load() raises
# FileNotFoundError unless 'api/cancer_model.joblib' exists under the directory the
# server (or notebook) is started from.
# joblib.load unpickles the file, so only load model files from a trusted source.
model = load('./api/cancer_model.joblib')

#Initialize Flask API endpoint blueprint
# Routes registered on this blueprint are served under the /api/cancer prefix
cancer_api = Blueprint('cancer_api', __name__, url_prefix='/api/cancer')
# flask_restful Api bound to the blueprint so Resource classes can be attached to it
api = Api(cancer_api)

#Use a post request to take data from the frontend as a JSON, then return output value
class cancerAPI:
    """Namespace grouping the REST resources registered on the cancer blueprint."""

    class _Predict(Resource):
        """Resource served at ``/predict`` under the blueprint's URL prefix."""

        def post(self):
            """Predict the class for one sample sent as a JSON object.

            The request body must be a JSON object mapping feature names to
            values for a single sample.

            Returns:
                ``({'score': <int>}, 200)`` on success, where the score is the
                model's predicted label; ``({'message': <str>}, 400)`` when the
                body is missing, is not a JSON object, lacks a feature the
                model was trained on, or holds values the model rejects.
            """
            # silent=True returns None for a missing or malformed JSON body instead
            # of raising, so the handler's own 400 message below is what the client sees
            body = request.get_json(silent=True)

            if body is None:
                return {'message': 'No data provided'}, 400
            if not isinstance(body, dict) or not body:
                return {'message': 'Expected a non-empty JSON object of feature values'}, 400

            #Convert frontend JSON output to a one-row pandas dataframe
            data = pd.DataFrame([body])

            # When the model recorded its training column names, require all of them
            # and pass them in the training order; extra keys from the client are dropped
            expected = getattr(model, 'feature_names_in_', None)
            if expected is not None:
                missing = [name for name in expected if name not in data.columns]
                if missing:
                    return {'message': 'Missing features: ' + ', '.join(missing)}, 400
                data = data[list(expected)]

            try:
                # model.predict returns a 1-element array; int() converts the NumPy
                # integer to a plain Python int so the response can be JSON-encoded
                score = int(model.predict(data)[0])
            except (ValueError, TypeError):
                return {'message': 'Invalid feature values'}, 400

            return {'score': score}, 200

    #Add endpoint resource for this method
    api.add_resource(_Predict, '/predict')

---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb Cell 10 line 9
      <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=5'>6</a> from joblib import load
      <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=7'>8</a> #Load the ML model: replace the file name with whatever yours is
----> <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=8'>9</a> model = load('./api/cancer_model.joblib')
     <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=10'>11</a> #Initialize Flask API endpoint blueprint
     <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X15sZmlsZQ%3D%3D?line=11'>12</a> cancer_api = Blueprint('cancer_api', __name__, url_prefix='/api/cancer')


File ~/Library/Python/3.9/lib/python/site-packages/joblib/numpy_pickle.py:650, in load(filename, mmap_mode)
    648         obj = _unpickle(fobj)
    649 else:
--> 650     with open(filename, 'rb') as f:
    651         with _read_fileobject(f, filename, mmap_mode) as fobj:
    652             if isinstance(fobj, str):
    653                 # if the returned file object is a string, this means we
    654                 # try to load a pickle file generated with an version of
    655                 # Joblib so we load it with joblib compatibility function.


FileNotFoundError: [Errno 2] No such file or directory: './api/cancer_model.joblib'

from __init__ import app
from api.cancerr import cancer_api
from flask import Flask
from flask_cors import CORS

#Enable CORS for everything
# NOTE(review): this rebinds the `app` name imported from `__init__` above, so the
# blueprint and hook below attach to this new Flask instance — confirm which app
# object the server is actually meant to run.
app = Flask(__name__)
# Keep a reference to the extension: before_request() below refers to it as `cors`,
# which was previously never bound and raised a NameError.
cors = CORS(app)

app.register_blueprint(cancer_api)

#Allow all CORS headers before requests
@app.before_request
def before_request():
    """Before each request, open the CORS origin setting when an Origin header is sent."""
    # Imported here because the module's import block only brings in `Flask`;
    # without it `request` was undefined and every request raised a NameError.
    from flask import request

    allowed_origin = request.headers.get('Origin')
    if allowed_origin:
        # NOTE(review): `_origins` is a private attribute of the flask-cors extension;
        # CORS(app) with default options presumably already allows every origin, so
        # this hook may be redundant — verify against the flask-cors documentation.
        cors._origins = "*"

---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)

/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb Cell 11 line 1
----> <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X20sZmlsZQ%3D%3D?line=0'>1</a> from __init__ import app
      <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X20sZmlsZQ%3D%3D?line=1'>2</a> from api.happy import happiness_api
      <a href='vscode-notebook-cell:/Users/rayanesouissi/tri3_machine_learning/_notebooks/.ipynb_checkpoints/2024-03-05-DS-pythossn-pandas-df_cancer.ipynb#X20sZmlsZQ%3D%3D?line=2'>3</a> from flask import Flask


ModuleNotFoundError: No module named '__init__'

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	Class
0	5	1	1	1	2	1	3	1	1	2
1	5	4	4	5	7	10	3	2	1	2
2	3	1	1	1	2	2	3	1	1	2
3	6	8	8	1	3	4	3	7	1	2
4	4	1	1	3	2	1	3	1	1	2
...	...	...	...	...	...	...	...	...	...	...
678	3	1	1	1	3	2	1	1	1	2
679	2	1	1	1	2	1	1	1	1	2
680	5	10	10	3	7	3	8	10	2	4
681	4	8	6	4	3	4	10	6	1	4
682	4	8	8	5	4	5	10	4	1	4

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Single Epithelial Cell Size	Bare Nuclei	Normal Nucleoli	Mitoses
0	5	1	1	2	1	1	1
1	5	4	4	7	10	2	1
2	3	1	1	2	2	1	1
3	6	8	8	3	4	7	1
4	4	1	1	2	1	1	1
...	...	...	...	...	...	...	...
678	3	1	1	3	2	1	1
679	2	1	1	2	1	1	1
680	5	10	10	7	3	10	2
681	4	8	6	3	4	6	1
682	4	8	8	4	5	4	1

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	Class
0	5	1	1	1	2	1	3	1	1	2
1	5	4	4	5	7	10	3	2	1	2
2	3	1	1	1	2	2	3	1	1	2
3	6	8	8	1	3	4	3	7	1	2
4	4	1	1	3	2	1	3	1	1	2
...	...	...	...	...	...	...	...	...	...	...
678	3	1	1	1	3	2	1	1	1	2
679	2	1	1	1	2	1	1	1	1	2
680	5	10	10	3	7	3	8	10	2	4
681	4	8	6	4	3	4	10	6	1	4
682	4	8	8	5	4	5	10	4	1	4

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Single Epithelial Cell Size	Bare Nuclei	Normal Nucleoli	Mitoses
0	5	1	1	2	1	1	1
1	5	4	4	7	10	2	1
2	3	1	1	2	2	1	1
3	6	8	8	3	4	7	1
4	4	1	1	2	1	1	1
...	...	...	...	...	...	...	...
678	3	1	1	3	2	1	1
679	2	1	1	2	1	1	1
680	5	10	10	7	3	10	2
681	4	8	6	3	4	6	1
682	4	8	8	4	5	4	1

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	Class
0	5	1	1	1	2	1	3	1	1	2
1	5	4	4	5	7	10	3	2	1	2
2	3	1	1	1	2	2	3	1	1	2
3	6	8	8	1	3	4	3	7	1	2
4	4	1	1	3	2	1	3	1	1	2
...	...	...	...	...	...	...	...	...	...	...
678	3	1	1	1	3	2	1	1	1	2
679	2	1	1	1	2	1	1	1	1	2
680	5	10	10	3	7	3	8	10	2	4
681	4	8	6	4	3	4	10	6	1	4
682	4	8	8	5	4	5	10	4	1	4

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Single Epithelial Cell Size	Bare Nuclei	Normal Nucleoli	Mitoses
0	5	1	1	2	1	1	1
1	5	4	4	7	10	2	1
2	3	1	1	2	2	1	1
3	6	8	8	3	4	7	1
4	4	1	1	2	1	1	1
...	...	...	...	...	...	...	...
678	3	1	1	3	2	1	1
679	2	1	1	2	1	1	1
680	5	10	10	7	3	10	2
681	4	8	6	3	4	6	1
682	4	8	8	4	5	4	1