save results

Junang-Wang · Mar 21, 2024 · b8d147e · b8d147e
2 parents caa2cbe + cd06c96
commit b8d147e
Show file tree

Hide file tree

Showing 23 changed files with 1,448 additions and 228 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -9,7 +9,8 @@
             "type": "python",
             "request": "launch",
             "module": "enter-your-module-name",
-            "justMyCode": true
+            "justMyCode": true,
+            "env": {"PL_TORCH_DISTRIBUTED_BACKEND":"gloo"}
         }
     ]
 }
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "cSpell.words": [
+        "denorm"
+    ]
+}
diff --git a/Modeling eMNS/Generative_model_ETH_v0.ipynb b/Modeling eMNS/Generative_model_ETH_v0.ipynb
@@ -103,10 +103,16 @@
     "print(min_Bfield.shape)\n",
     "print(max_Bfield.shape)\n",
     "\n",
-    "torch.save(min_current, \"./normalize_data/cnn_min_current_ETH.pt\")\n",
-    "torch.save(max_current, \"./normalize_data/cnn_max_current_ETH.pt\")\n",
-    "torch.save(min_Bfield, \"./normalize_data/cnn_min_Bfield_ETH.pt\")\n",
-    "torch.save(max_Bfield, \"./normalize_data/cnn_max_Bfield_ETH.pt\")"
+
+    "print(minB.shape)\n",
+    "print(maxB.shape)\n",
+    "current_norm_max, index = torch.max(Bfield_norm.transpose(0,1).reshape(3,-1), dim=1, keepdim=True)\n",
+    "print(current_norm_max)\n",
+    "# torch.save(min_current, \"./normalize_data/cnn_min_current_ETH.pt\")\n",
+    "# torch.save(max_current, \"./normalize_data/cnn_max_current_ETH.pt\")\n",
+    "# torch.save(min_Bfield, \"./normalize_data/cnn_min_Bfield_ETH.pt\")\n",
+    "# torch.save(max_Bfield, \"./normalize_data/cnn_max_Bfield_ETH.pt\")"
+
    ]
   },
   {
@@ -127,23 +133,35 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from Neural_network import Generative_net, Generative_net_test, ResidualEMNSBlock_3d, BigBlock, weight_init, eMNS_Dataset\n",
-    "from Training_loop import train_part_GM,get_mean_of_dataloader\n",
-    "from tqdm import tqdm\n",
-    "\n",
+    "from Neural_network import Generative_net,Generative_net_test ,ResidualEMNSBlock_3d, BigBlock, weight_init, eMNS_Dataset\n",
     "###############################################\n",
     "# Config the neural network\n",
     "###############################################\n",
     "num_input = 8\n",
     "output_shape = (3,16,16,16)\n",
-    "SB_args = (64,64,4,1) # (Cin, Cout, num_repeat, num_block)\n",
-    "BB_args = (2,2) # (scale_factor, num_block)\n",
+
+    "SB_args = (64,64,1,4) # (Cin, Cout, num_repeat, num_block)\n",
+    "BB_args = (2,3) # (scale_factor, num_block)\n",
+
     "SB_block = ResidualEMNSBlock_3d \n",
     "BB_block = BigBlock\n",
     "DF = False # whether using divergence free model\n",
     "\n",
-    "Generative_network = Generative_net(SB_args, BB_args, SB_block, BB_block, num_input=num_input, output_shape= output_shape)\n",
-    "print(Generative_network)"
+    "Generative_network = Generative_net_test(SB_args, BB_args, SB_block, BB_block, num_input=num_input, output_shape= output_shape)\n",
+    "print(Generative_network)\n",
+    "\n",
+    "from torchviz import make_dot\n",
+    "import torch.nn.functional as F\n",
+    "from Training_loop import grad_loss_Jacobain\n",
+    "x = torch.randn(2,8)\n",
+    "y = Bfield[0:2]\n",
+    "preds = Generative_network(x)\n",
+    "print(preds.shape)\n",
+    "loss =   F.l1_loss(preds,y)+grad_loss_Jacobain(preds,y)\n",
+    "        # optimizer.zero_grad() #zero out all of gradient\n",
+    "loss.backward()\n",
+    "\n",
+    "make_dot(loss, params=dict(Generative_network.named_parameters()))\n"
    ]
   },
   {
@@ -152,7 +170,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from Neural_network import Generative_net, Generative_net_test, ResidualEMNSBlock_3d, BigBlock, weight_init, eMNS_Dataset\n",
+    "from Neural_network import Generative_net, ResidualEMNSBlock_3d, BigBlock, weight_init, eMNS_Dataset\n",
     "from Training_loop import train_part_GM,get_mean_of_dataloader\n",
     "from tqdm import tqdm\n",
     "\n",
@@ -174,13 +192,13 @@
     "DF = False # whether using divergence free model\n",
     "\n",
     "Generative_network = Generative_net(SB_args, BB_args, SB_block, BB_block, num_input=num_input, output_shape= output_shape)\n",
-    "epochs = 400\n",
+    "epochs = 350\n",
     "learning_rate_decay = .5\n",
-    "learning_rates = [1e-4]\n",
+    "learning_rates = [1e-5]\n",
     "RMSE_lr = []\n",
     "schedule = []\n",
     "linear_lr = False\n",
-    "weight_decays = [0]\n",
+    "weight_decays = [1e-3]\n",
     "\n",
     "train_percents = np.arange(1.0,1.01,0.1)\n",
     "RMSE_history_end = np.zeros(len(train_percents))\n",
@@ -233,11 +251,16 @@
     "    mse_val_history_end[index] = mse_val_history[epoch_stop]\n",
     "    index=index+1\n",
     "    print('training stop at epoch:',epoch_stop)\n",
-    "    print('training stop at epoch:',Rsquare)\n",
-    "torch.save(Generative_network, 'EMS_CNN.pt')\t# 这里会存储迄今最优模型的参数\n",
-    "print(RMSE_lr)\n",
-    "print(learning_rates)\n",
-    "print(RMSE_lr[0],learning_rates[0])\n"
+    "    print('training stop at epoch:',Rsquare)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "torch.save(Generative_network, 'EMS_CNN_ETH.pt')\t# save the trained network (the whole model object) to disk"
    ]
   },
   {
@@ -246,6 +269,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "\n",
+    "print(RMSE_lr)\n",
+    "print(learning_rates)\n",
+    "print(RMSE_lr[0],learning_rates[0])\n",
+    "import matplotlib.pyplot as plt \n",
+    "plt.plot(learning_rates,RMSE_lr)\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "ave_site = 5\n",
@@ -259,6 +288,7 @@
     "plt.legend(['loss','loss_conv'])\n",
     "plt.xlabel('iterations')\n",
     "plt.ylabel('loss')\n",
+    "plt.ylim([0,1])\n",
     "plt.show()\n",
     "\n",
     "plt.title('Train and Val RMSE(sample_num=1000)')\n",
@@ -270,6 +300,7 @@
     "plt.legend(['train CNN','val CNN'])\n",
     "plt.xlabel('iterations')\n",
     "plt.ylabel('RMSE(mT)')\n",
+    "plt.ylim([0,100])\n",
     "plt.grid()\n",
     "plt.show()\n",
     "\n",
@@ -281,7 +312,8 @@
     "plt.ylabel('mse(mT^2)')\n",
     "plt.grid()\n",
     "plt.show()\n",
-    "print(epoch_stop)\n"
+    "print(epoch_stop)\n",
+    "\n"
    ]
   },
   {
@@ -310,7 +342,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,

diff --git a/Modeling eMNS/Generative_model_ETH_v2.ipynb b/Modeling eMNS/Generative_model_ETH_v2.ipynb
@@ -7,6 +7,15 @@
     "### Train ETH data to CNN generative network"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -U \"ray[data,train,tune,serve]\""
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -95,7 +104,7 @@
    "outputs": [],
    "source": [
     "from Neural_network import Generative_net, Generative_net_test, ResidualEMNSBlock_3d, BigBlock, weight_init, eMNS_Dataset\n",
-    "from Training_loop_v2 import train_GM\n",
+    "from Training_loop_v2 import train_GM, train_GM_ray\n",
     "from functools import partial\n",
     "from ray.train import RunConfig, ScalingConfig, CheckpointConfig\n",
     "from ray.train.torch import TorchTrainer\n",
@@ -116,17 +125,17 @@
     "extremes = dataset.train_norm(train_indices = train_set.indices)\n",
     "\n",
     "tune_schedule = ASHAScheduler(\n",
-    "        metric=\"loss\", # metric to optimize. This metric should be reported with tune.report()\n",
+    "        metric=\"rmse_val\", # metric to optimize. This metric should be reported with tune.report()\n",
     "        mode=\"min\",\n",
-    "        max_t=10,\n",
-    "        grace_period=1, # minimum stop epoch\n",
+    "        max_t=350,\n",
+    "        grace_period=10, # minimum stop epoch\n",
     "        reduction_factor=2,\n",
     "    )\n",
     "param_space = {\n",
     "    \"scaling_config\": ScalingConfig(\n",
     "        num_workers = 1,\n",
-    "        use_gpu = False,\n",
-    "        #resource_per_worker = {\"CPU\":1, \"GPU\":1}\n",
+    "        use_gpu = use_gpu,\n",
+    "        resources_per_worker = {\"CPU\":4, \"GPU\":1}\n",
     "    ),\n",
     "    # You can even grid search various datasets in Tune.\n",
     "    # \"datasets\": {\n",
@@ -135,19 +144,19 @@
     "    #     ),\n",
     "    # },\n",
     "    \"train_loop_config\": {\n",
-    "                'epochs': tune.choice([10]),\n",
-    "                'lr_max': tune.loguniform(1e-4,1e-2),\n",
-    "                'lr_min': tune.loguniform(1e-5,1e-7),\n",
-    "                'batch_size': tune.choice([4,8,16]),\n",
-    "                'L2_norm'   : tune.choice([0]),\n",
+    "                'epochs': 350,\n",
+    "                'lr_max': 1e-4,\n",
+    "                'lr_min': 2.5e-6,\n",
+    "                'batch_size': 8,\n",
+    "                'L2_norm'   : 0,\n",
     "                'verbose': False,\n",
-    "                'DF'     : tune.choice([True,False]),\n",
+    "                'DF'     : False,\n",
     "                'schedule': [],\n",
     "                'grid_space': 16**3,\n",
     "                'learning_rate_decay': 0.5,\n",
-    "                'skip_spacing': tune.choice([1,2,4]),\n",
-    "                'num_repeat'  : tune.choice([1,2,4]),\n",
-    "                'num_block'   : tune.choice([1,2,3]),\n",
+    "                'skip_spacing': tune.grid_search([1,2,4]),\n",
+    "                'num_repeat'  : tune.grid_search([1,2,4]),\n",
+    "                'num_block'   : tune.grid_search([1,2,3]),\n",
     "                'maxB'        : extremes[2],\n",
     "                'minB'        : extremes[3],\n",
     "                'train_set'   : train_set,\n",
@@ -170,7 +179,7 @@
     "################################################\n",
     "\n",
     "train_loop_config = {\n",
-    "                'epochs': 10,\n",
+    "                'epochs': 350,\n",
     "                'lr_max': 1e-4,\n",
     "                'lr_min': 2.5e-6,\n",
     "                'batch_size': 8,\n",
@@ -197,7 +206,7 @@
     "scaling_config = ScalingConfig(\n",
     "    num_workers = 1,\n",
     "    use_gpu = use_gpu,\n",
-    "    #resource_per_worker = {\"CPU\":1, \"GPU\":1}\n",
+    "    resources_per_worker = {\"CPU\":8, \"GPU\":2}\n",
     ")\n",
     "\n",
     "run_config = RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=1))\n",
@@ -212,18 +221,40 @@
     "    run_config = run_config,\n",
     "\n",
     ")\n",
+    "\n",
     "result = trainer.fit()\n",
     "# tuner = tune.Tuner(\n",
     "#     trainer,\n",
     "#     param_space = param_space,\n",
     "#     tune_config =tune.TuneConfig(\n",
     "#         scheduler=tune_schedule,\n",
-    "#         num_samples=10, # number of samples of hyperparameter space\n",
+    "#         num_samples=1, # number of samples of hyperparameter space\n",
     "#     ),\n",
     "#     # run_config = RunConfig(storage_path=\"./results\", name=\"test_experiment\")\n",
     "# )\n",
     "    \n",
-    "# tuner.fit()"
+    "# results = tuner.fit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(torch.device(type='cuda', index=0))\n",
+    "print(ray.train.torch.get_device())\n",
+    "print(torch.device('cuda:0'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "best_result = results.get_best_result(metric='rmse_val',mode='min')\n",
+    "print(best_result)"
    ]
   },
   {
@@ -233,7 +264,16 @@
    "outputs": [],
    "source": [
     "from utils import plot_ray_results\n",
-    "plot_ray_results(result, metrics_names=['rmse_train','rmse_val'])"
+    "plot_ray_results(best_result, metrics_names=['rmse_train','rmse_val'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_ray_results(result, metrics_names=['rmse_train','rmse_val'],ylim=[0,25])"
    ]
   },
   {
@@ -272,9 +312,9 @@
     "    'schedule': [],\n",
     "    'grid_space': 16**3,\n",
     "    'learning_rate_decay': 0.5,\n",
-    "    'skip_spacing': 1,\n",
-    "    'num_repeat'  : 4,\n",
-    "    'num_block'   : 2,\n",
+    "    'skip_spacing': 2,\n",
+    "    'num_repeat'  : 2,\n",
+    "    'num_block'   : 3,\n",
     "    'device'      : device,\n",
     "}\n",
     "train_percents = np.arange(1.0,1.01,0.1)\n",
@@ -359,7 +399,7 @@
     "plt.legend(['loss','loss_conv'])\n",
     "plt.xlabel('iterations')\n",
     "plt.ylabel('loss')\n",
-    "# plt.ylim([0,10])\n",
+    "plt.ylim([0,10])\n",
     "plt.show()\n",
     "\n",
     "plt.title('Train and Val RMSE(sample_num=1000)')\n",
@@ -404,7 +444,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,