Spaces:
Sleeping
Sleeping
Upload chest-xray-anomaly-ml-model-deployment (4).ipynb (#1)
Browse files- Upload chest-xray-anomaly-ml-model-deployment (4).ipynb (32f5727db5050174aa7ff637dc8b65e862582387)
chest-xray-anomaly-ml-model-deployment (4).ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{
 "metadata": {
  "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
  "language_info": {
   "name": "python",
   "version": "3.6.6",
   "mimetype": "text/x-python",
   "codemirror_mode": {"name": "ipython", "version": 3},
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kaggle": {
   "accelerator": "none",
   "dataSources": [{"sourceId": 7773, "sourceType": "datasetVersion", "datasetId": 4667}],
   "dockerImageVersionId": 29271,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook",
   "isGpuEnabled": false
  }
 },
 "nbformat_minor": 4,
 "nbformat": 4,
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Introduction\n",
    "Greetings from the Kaggle bot! This is an automatically-generated kernel with starter code demonstrating how to read in the data and begin exploring. Click the blue \"Edit Notebook\" or \"Fork Notebook\" button at the top of this kernel to begin editing.\n",
    "\n",
    "After the exploratory analysis, the notebook trains a small convolutional network for multi-label chest X-ray classification, evaluates it on a held-out test split, and saves the trained model."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exploratory Analysis\n",
    "To begin this exploratory analysis, first import the libraries and define functions for plotting the data with `matplotlib`. Depending on the data, not all plots will be made. (Hey, I'm just a kerneling bot, not a Kaggle Competitions Grandmaster!)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"collapsed": false, "_kg_hide-input": false, "jupyter": {"outputs_hidden": false}, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# All imports live in this one cell so the notebook runs top to bottom on a fresh kernel.\n",
    "import os  # accessing directory structure\n",
    "\n",
    "import cv2\n",
    "import matplotlib.pyplot as plt  # plotting\n",
    "import numpy as np  # linear algebra\n",
    "import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import tensorflow as tf\n",
    "from IPython.display import FileLink\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from tensorflow.keras.callbacks import EarlyStopping\n",
    "from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.preprocessing.image import ImageDataGenerator"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There is 1 csv file in the current version of the dataset:\n"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"collapsed": false, "_kg_hide-input": false, "jupyter": {"outputs_hidden": false}, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# The input directory also holds the image files; list only the CSV files so\n",
    "# the output matches the statement above and stays readable.\n",
    "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
    "    for filename in filenames:\n",
    "        if filename.lower().endswith('.csv'):\n",
    "            print(os.path.join(dirname, filename))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The next hidden code cells define functions for plotting data. Click on the \"Code\" button in the published kernel to reveal the hidden code."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"_kg_hide-input": true, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Distribution graphs (histogram/bar graph) of column data\n",
    "def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):\n",
    "    \"\"\"Plot a histogram or bar chart for each low-cardinality column of ``df``.\n",
    "\n",
    "    Only columns with more than 1 and fewer than 50 unique values are kept.\n",
    "    Numeric columns get a histogram, all others a bar chart of value counts.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame whose columns are plotted.\n",
    "    nGraphShown : int\n",
    "        Maximum number of columns to plot.\n",
    "    nGraphPerRow : int\n",
    "        Number of subplots per figure row.\n",
    "    \"\"\"\n",
    "    nunique = df.nunique()\n",
    "    # For displaying purposes, pick columns that have between 1 and 50 unique values\n",
    "    df = df[[col for col in df if 1 < nunique[col] < 50]]\n",
    "    nRow, nCol = df.shape\n",
    "    nShown = min(nCol, nGraphShown)\n",
    "    if nShown <= 0:\n",
    "        # Nothing qualifies: a zero-height figure would make matplotlib fail.\n",
    "        print('No distribution plots shown: no column has between 1 and 50 unique values')\n",
    "        return\n",
    "    columnNames = list(df)\n",
    "    # Ceiling division with // keeps the row count an int; plt.subplot rejects floats.\n",
    "    nGraphRow = (nCol + nGraphPerRow - 1) // nGraphPerRow\n",
    "    plt.figure(num=None, figsize=(6 * nGraphPerRow, 8 * nGraphRow), dpi=80, facecolor='w', edgecolor='k')\n",
    "    for i in range(nShown):\n",
    "        plt.subplot(nGraphRow, nGraphPerRow, i + 1)\n",
    "        columnDf = df.iloc[:, i]\n",
    "        if not np.issubdtype(type(columnDf.iloc[0]), np.number):\n",
    "            columnDf.value_counts().plot.bar()\n",
    "        else:\n",
    "            columnDf.hist()\n",
    "        plt.ylabel('counts')\n",
    "        plt.xticks(rotation=90)\n",
    "        plt.title(f'{columnNames[i]} (column {i})')\n",
    "    plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"_kg_hide-input": true, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Correlation matrix\n",
    "def plotCorrelationMatrix(df, graphWidth):\n",
    "    \"\"\"Show the correlation matrix of the numeric, NaN-free, non-constant columns.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame to analyse. It must carry a ``dataframeName`` attribute, which\n",
    "        is used in the plot title.\n",
    "    graphWidth : float\n",
    "        Width and height of the (square) figure, in inches.\n",
    "    \"\"\"\n",
    "    filename = df.dataframeName\n",
    "    # drop columns with NaN; the axis is passed by keyword because pandas 2.0\n",
    "    # no longer accepts it positionally\n",
    "    df = df.dropna(axis='columns')\n",
    "    # corr() is only defined for numeric data, so leave text columns out explicitly\n",
    "    df = df.select_dtypes(include=[np.number])\n",
    "    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values\n",
    "    if df.shape[1] < 2:\n",
    "        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')\n",
    "        return\n",
    "    corr = df.corr()\n",
    "    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')\n",
    "    corrMat = plt.matshow(corr, fignum=1)\n",
    "    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)\n",
    "    plt.yticks(range(len(corr.columns)), corr.columns)\n",
    "    plt.gca().xaxis.tick_bottom()\n",
    "    plt.colorbar(corrMat)\n",
    "    plt.title(f'Correlation Matrix for {filename}', fontsize=15)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"_kg_hide-input": true, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Scatter and density plots\n",
    "def plotScatterMatrix(df, plotSize, textSize):\n",
    "    \"\"\"Draw a scatter matrix with KDE diagonals, annotated with correlation coefficients.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame to plot; only numeric, NaN-free, non-constant columns are used,\n",
    "        and at most the first 10 of them.\n",
    "    plotSize : float\n",
    "        Width and height of the (square) figure, in inches.\n",
    "    textSize : float\n",
    "        Font size of the correlation annotations.\n",
    "    \"\"\"\n",
    "    df = df.select_dtypes(include=[np.number])  # keep only numerical columns\n",
    "    # Remove rows and columns that would lead to df being singular\n",
    "    df = df.dropna(axis='columns')\n",
    "    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values\n",
    "    # reduce the number of columns for matrix inversion of kernel density plots\n",
    "    columnNames = list(df)[:10]\n",
    "    if not columnNames:\n",
    "        print('No scatter plots shown: no numeric, non-constant column without NaN')\n",
    "        return\n",
    "    df = df[columnNames]\n",
    "    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')\n",
    "    corrs = df.corr().values\n",
    "    # Annotate only the cells above the diagonal.\n",
    "    for i, j in zip(*np.triu_indices_from(ax, k=1)):\n",
    "        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)\n",
    "    plt.suptitle('Scatter and Density Plot')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now you're ready to read in the data and use the plotting functions to visualize the data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let's check 1st file: /kaggle/input/sample_labels.csv"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"collapsed": false, "_kg_hide-input": false, "jupyter": {"outputs_hidden": false}, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "nRowsRead = 1000  # specify 'None' if want to read whole file\n",
    "df1 = pd.read_csv('/kaggle/input/sample_labels.csv', delimiter=',', nrows=nRowsRead)\n",
    "df1.dataframeName = 'sample_labels.csv'\n",
    "nRow, nCol = df1.shape\n",
    "print(f'There are {nRow} rows and {nCol} columns')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's take a quick look at what the data looks like:"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"collapsed": false, "_kg_hide-input": false, "jupyter": {"outputs_hidden": false}, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "df1.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Distribution graphs (histogram/bar graph) of sampled columns:"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"collapsed": false, "_kg_hide-input": false, "jupyter": {"outputs_hidden": false}, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "plotPerColumnDistribution(df1, 10, 5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Correlation matrix:"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"collapsed": false, "_kg_hide-input": false, "jupyter": {"outputs_hidden": false}, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "plotCorrelationMatrix(df1, 8)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Scatter and density plots:"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"collapsed": false, "_kg_hide-input": false, "jupyter": {"outputs_hidden": false}, "trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "plotScatterMatrix(df1, 18, 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multi-label chest X-ray classifier\n",
    "The remaining cells load the full label file, turn the `Finding Labels` column into one binary column per finding, split the images into train/validation/test sets, and train a small CNN with one sigmoid output per finding.\n",
    "\n",
    "### Load the labels"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Load dataset\n",
    "data_path = '/kaggle/input/sample/sample_labels.csv'\n",
    "df = pd.read_csv(data_path)\n",
    "print('Column names:', df.columns.tolist())\n",
    "print('First few rows:\\n', df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare labels, image paths and splits"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Preprocess labels\n",
    "label_column = 'Finding Labels'\n",
    "if label_column not in df.columns:\n",
    "    raise KeyError(f\"Column '{label_column}' not found. Available columns: {df.columns.tolist()}\")\n",
    "\n",
    "# Get unique labels (excluding 'No Finding')\n",
    "all_labels = sorted(set(label for sublist in df[label_column].str.split('|') for label in sublist if label != 'No Finding'))\n",
    "print('Labels:', all_labels)\n",
    "\n",
    "# Create binary columns for each label\n",
    "for label in all_labels:\n",
    "    df[label] = df[label_column].apply(lambda x: 1 if label in x.split('|') else 0)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Prepare image paths\n",
    "image_dir = '/kaggle/input/sample/images'  # Corrected path\n",
    "df['Image Path'] = df['Image Index'].apply(lambda x: os.path.join(image_dir, x))\n",
    "\n",
    "# Debug: Check image directory and files\n",
    "print(f'Image directory: {image_dir}')\n",
    "if os.path.exists(image_dir):\n",
    "    print(f'Sample files: {os.listdir(image_dir)[:5]}')\n",
    "else:\n",
    "    raise FileNotFoundError(f'Image directory {image_dir} does not exist.')\n",
    "\n",
    "# Check valid image paths\n",
    "valid_paths = df['Image Path'].apply(os.path.exists)\n",
    "print(f'Total images: {len(df)}, Valid paths: {valid_paths.sum()}')\n",
    "if valid_paths.sum() == 0:\n",
    "    raise FileNotFoundError(f'No valid images in {image_dir}. Check file names.')\n",
    "\n",
    "# Filter valid paths\n",
    "df = df[valid_paths]\n",
    "print(f'Valid images after filtering: {len(df)}')"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Split data: 70% train, 15% validation, 15% test\n",
    "train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)\n",
    "val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
    "assert len(train_df) + len(val_df) + len(test_df) == len(df)\n",
    "print(f'Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data generators\n",
    "Only the training generator augments and shuffles. The validation and test generators read the images unchanged and in frame order, so that predictions can be compared row-for-row with the labels in `test_df`."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Data generator\n",
    "def create_data_generator(df, batch_size=32, augment=True, shuffle=True):\n",
    "    \"\"\"Build a Keras image generator over the rows of ``df``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must contain an 'Image Path' column and one column per entry of the\n",
    "        notebook-level ``all_labels`` list.\n",
    "    batch_size : int\n",
    "        Number of images per batch.\n",
    "    augment : bool\n",
    "        Apply random rotation/shift/zoom/flip. Keep the default for training;\n",
    "        pass False for validation and test data.\n",
    "    shuffle : bool\n",
    "        Shuffle the rows. Pass False whenever predictions are compared with\n",
    "        labels taken from ``df`` in its own row order.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The iterator returned by ``ImageDataGenerator.flow_from_dataframe``,\n",
    "    yielding 224x224 grayscale batches rescaled by 1/255 with raw label arrays.\n",
    "    \"\"\"\n",
    "    if augment:\n",
    "        datagen = ImageDataGenerator(\n",
    "            rescale=1./255,\n",
    "            rotation_range=10,\n",
    "            width_shift_range=0.1,\n",
    "            height_shift_range=0.1,\n",
    "            zoom_range=0.1,\n",
    "            horizontal_flip=True\n",
    "        )\n",
    "    else:\n",
    "        datagen = ImageDataGenerator(rescale=1./255)\n",
    "    return datagen.flow_from_dataframe(\n",
    "        dataframe=df,\n",
    "        x_col='Image Path',\n",
    "        y_col=all_labels,\n",
    "        target_size=(224, 224),\n",
    "        color_mode='grayscale',\n",
    "        class_mode='raw',\n",
    "        batch_size=batch_size,\n",
    "        shuffle=shuffle\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "train_generator = create_data_generator(train_df)\n",
    "val_generator = create_data_generator(val_df, augment=False, shuffle=False)\n",
    "test_generator = create_data_generator(test_df, augment=False, shuffle=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build model"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Build model\n",
    "model = Sequential([\n",
    "    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),\n",
    "    MaxPooling2D((2, 2)),\n",
    "    Conv2D(64, (3, 3), activation='relu'),\n",
    "    MaxPooling2D((2, 2)),\n",
    "    Conv2D(128, (3, 3), activation='relu'),\n",
    "    MaxPooling2D((2, 2)),\n",
    "    Flatten(),\n",
    "    Dense(128, activation='relu'),\n",
    "    Dropout(0.5),\n",
    "    # Sigmoid for multi-label: one unit per label column, sized from all_labels\n",
    "    # so the output always matches the targets yielded by the generators.\n",
    "    Dense(len(all_labels), activation='sigmoid')\n",
    "])\n",
    "\n",
    "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Early stopping\n",
    "early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n",
    "\n",
    "# Train model\n",
    "history = model.fit(\n",
    "    train_generator,\n",
    "    validation_data=val_generator,\n",
    "    epochs=10,\n",
    "    callbacks=[early_stopping]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Plot training history\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(history.history['loss'], label='Training Loss')\n",
    "ax.plot(history.history['val_loss'], label='Validation Loss')\n",
    "ax.set_xlabel('Epoch')\n",
    "ax.set_ylabel('Loss')\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Evaluate model\n",
    "test_loss, test_accuracy = model.evaluate(test_generator)\n",
    "print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')\n",
    "\n",
    "# Get predictions\n",
    "test_generator.reset()\n",
    "y_pred = model.predict(test_generator)\n",
    "y_pred_binary = (y_pred > 0.5).astype(int)\n",
    "# test_generator was created with shuffle=False, so prediction i belongs to row i of test_df.\n",
    "y_true = test_df[all_labels].values\n",
    "assert y_true.shape == y_pred_binary.shape, 'predictions and labels differ in shape; the generator may have skipped rows'\n",
    "\n",
    "# Classification report\n",
    "print('Classification Report:')\n",
    "print(classification_report(y_true, y_pred_binary, target_names=all_labels))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save and download the model"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "%cd /kaggle/working/"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Weights only (no architecture). Written to its own file so it cannot\n",
    "# overwrite the full-model file saved below.\n",
    "model.save_weights('chest_xray_weights_only.h5')\n",
    "\n",
    "# Saves the full model (architecture + weights)\n",
    "model.save('chest_xray_weights.h5')"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {"trusted": true},
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Download link for the full model saved above\n",
    "FileLink('chest_xray_weights.h5')"
   ]
  }
 ]
}
|