Muennighoff commited on
Commit
de1dac2
•
1 Parent(s): 7f89c55

Move models

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 14m100m100m/global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  2. 14m100m100m/global_step190/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3 -0
  3. 14m100m100m/global_step190/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3 -0
  4. 14m100m100m/global_step190/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3 -0
  5. 14m100m100m/global_step190/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3 -0
  6. 14m100m100m/global_step190/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3 -0
  7. 14m100m100m/global_step190/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3 -0
  8. 14m100m100m/global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  9. 14m100m100m/global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  10. 14m100m100m/global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  11. 14m100m100m/global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  12. 14m100m100m/global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  13. 14m100m100m/global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  14. 14m100m100m/global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  15. 14m100m100m/global_step190/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3 -0
  16. 14m100m100m/global_step190/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3 -0
  17. 14m100m100m/global_step190/layer_01-model_00-model_states.pt +3 -0
  18. 14m100m100m/global_step190/layer_03-model_00-model_states.pt +3 -0
  19. 14m100m100m/global_step190/layer_04-model_00-model_states.pt +3 -0
  20. 14m100m100m/global_step190/layer_05-model_00-model_states.pt +3 -0
  21. 14m100m100m/global_step190/layer_06-model_00-model_states.pt +3 -0
  22. 14m100m100m/global_step190/layer_08-model_00-model_states.pt +3 -0
  23. 14m100m100m/global_step190/mp_rank_00_model_states.pt +3 -0
  24. 14m100m100m/logs/3163413.err +301 -0
  25. 14m100m100m/logs/3163413.out +0 -0
  26. 14m100m100m/sbatch_14m100m100m.sh +168 -0
  27. 14m100m100m/sbatch_14m100m100mval.sh +167 -0
  28. 14m100m100m/tensorboard_14m100m100m/events.out.tfevents.1677508517.nid006613.13417.0 +3 -0
  29. 14m100m100m/tensorboard_14m100m100mval/events.out.tfevents.1677509121.nid007495.92522.0 +3 -0
  30. 14m100m100m/tensorboard_14m100m100mval/events.out.tfevents.1678800406.nid007281.60225.0 +3 -0
  31. {14m1b5100mv → 14m1b5100m}/3307602.err +0 -0
  32. {14m1b5100mv → 14m1b5100m}/3307602.out +0 -0
  33. {14m7b5100mv → 14m7b5100m}/3307761.err +0 -0
  34. {14m7b5100mv → 14m7b5100m}/3307761.out +0 -0
  35. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  36. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +3 -0
  37. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +3 -0
  38. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt +3 -0
  39. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt +3 -0
  40. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt +3 -0
  41. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt +3 -0
  42. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt +3 -0
  43. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt +3 -0
  44. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt +3 -0
  45. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt +3 -0
  46. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3 -0
  47. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt +3 -0
  48. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt +3 -0
  49. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt +3 -0
  50. 1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt +3 -0
14m100m100m/global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c628e1090f463618347286f69aa3e507204d302358e32a025d0b65c8f9545ca
3
+ size 10614999
14m100m100m/global_step190/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c012dab7e55337e0b0aff2b427e9969cb29681d61877181a522bc0199314f0e8
3
+ size 10615074
14m100m100m/global_step190/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2060669813e272a71abd4140a4fd2557c426a8863389b67d1e8912dd90bfe11f
3
+ size 10615138
14m100m100m/global_step190/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1948d595de13f97ca9a54ddc2eeb8b1504fcff641d41bbb5ddceb9996bc09124
3
+ size 10615010
14m100m100m/global_step190/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e26dbb67778c4d56ce590294d707e37ec02ab5f3832886b610fce694e969e64
3
+ size 10615202
14m100m100m/global_step190/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44767c6dcded60bb44da8040750076543902878e809f045f5ce161dc32337dac
3
+ size 10615202
14m100m100m/global_step190/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9aafd26fc2d64f0ca2e93f103a31013eb707973522242df74cdb75e241f4a4f
3
+ size 10615266
14m100m100m/global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faa9c102a2087d607eba917b620d569c1b02b7d4a809de60f98fea51bfbe1913
3
+ size 10615063
14m100m100m/global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ceb01cf4f17188679e552effa47ab7dd99039439ba13ee0fef722adfeccdca06
3
+ size 10615127
14m100m100m/global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a19929e016dd32bd618e4f51667b913ccc4f04f99e63317350a3c97c09ca2804
3
+ size 10614999
14m100m100m/global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4865027f32f57ed28bbb022570e78683576a9b20aa707f304eb3125fed7a0677
3
+ size 10615063
14m100m100m/global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f4996f10f0fe2086dd9f09b633fe7f40f6d90dac0fa051dad19f64985a7d014
3
+ size 10615127
14m100m100m/global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a05b7bb4f6af4f24baef0f1cd40ba9253055212057dd5b8b761e121a0edfe8ed
3
+ size 10614999
14m100m100m/global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c7cc3611f3142efc1799549e0112b8bd317def2d887062f1548f4a8b6e6de74
3
+ size 10614999
14m100m100m/global_step190/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477bedcec57e414afffb3afd6ec93d50ebf7af37008dd494923b2569aeb3acbc
3
+ size 10614999
14m100m100m/global_step190/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:766117132e9d4b120aa3e6fadb6186850eac3d7b98bfbd5038413d7cd60fd6c2
3
+ size 10615191
14m100m100m/global_step190/layer_01-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36e37eeead5e761eb8043b259d2a8467e17a2441704d411126e391fdd80cda17
3
+ size 23454979
14m100m100m/global_step190/layer_03-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41bde257eb5783d1423d3d2f1a12e64fbbefe682f0fef37bf323dfb8e48cf717
3
+ size 1214403
14m100m100m/global_step190/layer_04-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da1358dcd3720b9d840026efe1c62aef85c0d17e450b105b217c515bb67765b
3
+ size 1214403
14m100m100m/global_step190/layer_05-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86b64c8a8b36c4b168a33724d2761bb787e12d0182870e6632a653dfa1fb00d7
3
+ size 1214403
14m100m100m/global_step190/layer_06-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7338d4f71a29dbbecbf090abf6f0be3ac6c1118839e96b4b15fc23b7a1aa00b3
3
+ size 1214403
14m100m100m/global_step190/layer_08-model_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dec12d04b1ce30ef368fc14028401fab453e8086e92983dfc8347e293631aef7
3
+ size 2115
14m100m100m/global_step190/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93fdb4efd17f0cff9b3a9a672e07fede6681c70783e1ccd383daa62abeb6608c
3
+ size 26995
14m100m100m/logs/3163413.err ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0: 2023-02-27 16:44:48.450670: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
2
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
3
+ 0: 2023-02-27 16:44:48.450675: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
4
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
5
+ 0: 2023-02-27 16:44:48.450684: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
6
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
7
+ 0: 2023-02-27 16:44:48.450679: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
8
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
9
+ 0: 2023-02-27 16:44:48.450684: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
10
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
11
+ 0: 2023-02-27 16:44:48.450682: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
12
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
13
+ 0: 2023-02-27 16:44:48.450688: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
14
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
15
+ 0: 2023-02-27 16:44:48.450691: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
16
+ 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
17
+ 1: 2023-02-27 16:44:48.526135: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
18
+ 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
19
+ 1: 2023-02-27 16:44:48.526142: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
20
+ 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
21
+ 1: 2023-02-27 16:44:48.526148: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
22
+ 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
23
+ 1: 2023-02-27 16:44:48.526151: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
24
+ 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
25
+ 1: 2023-02-27 16:44:48.526146: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
26
+ 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
27
+ 1: 2023-02-27 16:44:48.526154: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
28
+ 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
29
+ 1: 2023-02-27 16:44:48.526160: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
30
+ 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
31
+ 1: 2023-02-27 16:44:48.526159: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
32
+ 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
33
+ 0: 2023-02-27 16:44:50.091673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
34
+ 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
35
+ 0: 2023-02-27 16:44:50.091674: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
36
+ 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
37
+ 0: 2023-02-27 16:44:50.091682: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
38
+ 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
39
+ 0: 2023-02-27 16:44:50.091694: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
40
+ 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
41
+ 0: 2023-02-27 16:44:50.091689: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
42
+ 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
43
+ 0: 2023-02-27 16:44:50.091690: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
44
+ 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
45
+ 0: 2023-02-27 16:44:50.091699: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
46
+ 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
47
+ 0: 2023-02-27 16:44:50.091691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
48
+ 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
49
+ 0: 2023-02-27 16:44:50.092026: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
50
+ 0: 2023-02-27 16:44:50.092029: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
51
+ 0: 2023-02-27 16:44:50.092033: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
52
+ 0: 2023-02-27 16:44:50.092037: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
53
+ 0: 2023-02-27 16:44:50.092039: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
54
+ 0: 2023-02-27 16:44:50.092040: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
55
+ 0: 2023-02-27 16:44:50.092041: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
56
+ 0: 2023-02-27 16:44:50.092044: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
57
+ 1: 2023-02-27 16:44:50.240535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
58
+ 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
59
+ 1: 2023-02-27 16:44:50.240536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
60
+ 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
61
+ 1: 2023-02-27 16:44:50.240540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
62
+ 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
63
+ 1: 2023-02-27 16:44:50.240542: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
64
+ 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
65
+ 1: 2023-02-27 16:44:50.240545: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
66
+ 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
67
+ 1: 2023-02-27 16:44:50.240543: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
68
+ 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
69
+ 1: 2023-02-27 16:44:50.240733: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
70
+ 1: 2023-02-27 16:44:50.240735: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
71
+ 1: 2023-02-27 16:44:50.240546: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
72
+ 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
73
+ 1: 2023-02-27 16:44:50.240540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200
74
+ 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
75
+ 1: 2023-02-27 16:44:50.240742: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
76
+ 1: 2023-02-27 16:44:50.240743: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
77
+ 1: 2023-02-27 16:44:50.240749: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
78
+ 1: 2023-02-27 16:44:50.240750: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
79
+ 1: 2023-02-27 16:44:50.240752: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
80
+ 1: 2023-02-27 16:44:50.240751: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
81
+ 0: 2023-02-27 16:44:55.272694: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
82
+ 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
83
+ 1: 2023-02-27 16:44:55.272688: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
84
+ 0: 2023-02-27 16:44:55.272700: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
85
+ 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
86
+ 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
87
+ 1: 2023-02-27 16:44:55.272697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
88
+ 0: 2023-02-27 16:44:55.272706: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
89
+ 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
90
+ 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
91
+ 1: 2023-02-27 16:44:55.272694: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
92
+ 0: 2023-02-27 16:44:55.272706: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
93
+ 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
94
+ 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
95
+ 1: 2023-02-27 16:44:55.272695: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
96
+ 0: 2023-02-27 16:44:55.272709: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
97
+ 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
98
+ 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
99
+ 1: 2023-02-27 16:44:55.272701: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
100
+ 0: 2023-02-27 16:44:55.272711: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
101
+ 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
102
+ 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
103
+ 1: 2023-02-27 16:44:55.272699: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
104
+ 0: 2023-02-27 16:44:55.272713: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
105
+ 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
106
+ 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
107
+ 1: 2023-02-27 16:44:55.272703: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
108
+ 0: 2023-02-27 16:44:55.272717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
109
+ 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
110
+ 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
111
+ 1: 2023-02-27 16:44:55.272702: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125
112
+ 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
113
+ 1: 2023-02-27 16:44:55.274738: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
114
+ 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
115
+ 1: 2023-02-27 16:44:55.274737: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
116
+ 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
117
+ 1: 2023-02-27 16:44:55.274738: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
118
+ 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
119
+ 1: 2023-02-27 16:44:55.274740: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
120
+ 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
121
+ 1: 2023-02-27 16:44:55.274742: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
122
+ 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
123
+ 1: 2023-02-27 16:44:55.274743: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
124
+ 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
125
+ 1: 2023-02-27 16:44:55.274756: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
126
+ 1: 2023-02-27 16:44:55.274752: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
127
+ 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
128
+ 1: 2023-02-27 16:44:55.274757: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
129
+ 1: 2023-02-27 16:44:55.274756: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
130
+ 1: 2023-02-27 16:44:55.274759: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
131
+ 1: 2023-02-27 16:44:55.274758: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
132
+ 1: 2023-02-27 16:44:55.274758: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
133
+ 1: 2023-02-27 16:44:55.274768: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
134
+ 1: 2023-02-27 16:44:55.274797: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
135
+ 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
136
+ 1: 2023-02-27 16:44:55.274809: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
137
+ 0: 2023-02-27 16:44:55.274653: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
138
+ 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
139
+ 0: 2023-02-27 16:44:55.274654: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
140
+ 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
141
+ 0: 2023-02-27 16:44:55.274655: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
142
+ 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
143
+ 0: 2023-02-27 16:44:55.274658: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
144
+ 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
145
+ 0: 2023-02-27 16:44:55.274662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
146
+ 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
147
+ 0: 2023-02-27 16:44:55.274662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
148
+ 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
149
+ 0: 2023-02-27 16:44:55.274663: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
150
+ 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
151
+ 0: 2023-02-27 16:44:55.274669: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
152
+ 0: 2023-02-27 16:44:55.274664: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro
153
+ 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64
154
+ 0: 2023-02-27 16:44:55.274671: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
155
+ 0: 2023-02-27 16:44:55.274671: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
156
+ 0: 2023-02-27 16:44:55.274673: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
157
+ 0: 2023-02-27 16:44:55.274680: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
158
+ 0: 2023-02-27 16:44:55.274683: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
159
+ 0: 2023-02-27 16:44:55.274684: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
160
+ 0: 2023-02-27 16:44:55.274686: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
161
+ 0: Successfully preprocessed all matching files.
162
+ 0: Detected CUDA files, patching ldflags
163
+ 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
164
+ 0: Building extension module scaled_upper_triang_masked_softmax_cuda...
165
+ 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
166
+ 0: Loading extension module scaled_upper_triang_masked_softmax_cuda...
167
+ 0: Successfully preprocessed all matching files.
168
+ 0: Detected CUDA files, patching ldflags
169
+ 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
170
+ 0: Building extension module scaled_masked_softmax_cuda...
171
+ 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
172
+ 0: Loading extension module scaled_masked_softmax_cuda...
173
+ 0: Successfully preprocessed all matching files.
174
+ 0: Detected CUDA files, patching ldflags
175
+ 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
176
+ 0: Building extension module fused_mix_prec_layer_norm_cuda...
177
+ 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
178
+ 0: Loading extension module fused_mix_prec_layer_norm_cuda...
179
+ 0: Successfully preprocessed all matching files.
180
+ 0: Successfully preprocessed all matching files.
181
+ 0: Successfully preprocessed all matching files.
182
+ 0: Successfully preprocessed all matching files.
183
+ 0: Successfully preprocessed all matching files.
184
+ 0: Successfully preprocessed all matching files.
185
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
186
+ 0: warnings.warn(
187
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
188
+ 0: warnings.warn(
189
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
190
+ 0: warnings.warn(
191
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
192
+ 0: warnings.warn(
193
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
194
+ 0: warnings.warn(
195
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
196
+ 0: warnings.warn(
197
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
198
+ 0: warnings.warn(
199
+ 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
200
+ 1: warnings.warn(
201
+ 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
202
+ 1: warnings.warn(
203
+ 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
204
+ 1: warnings.warn(
205
+ 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
206
+ 1: warnings.warn(
207
+ 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
208
+ 1: warnings.warn(
209
+ 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
210
+ 1: warnings.warn(
211
+ 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
212
+ 1: warnings.warn(
213
+ 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
214
+ 1: warnings.warn(
215
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead
216
+ 0: warnings.warn(
217
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
218
+ 0:
219
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
220
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
221
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
222
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
223
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
224
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
225
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
226
+ 1:
227
+ 1:
228
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
229
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
230
+ 1:
231
+ 1:
232
+ 0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja...
233
+ 0: Building extension module utils...
234
+ 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
235
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
236
+ 0: Loading extension module utils...
237
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
238
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
239
+ 0: Loading extension module utils...
240
+ 0: Loading extension module utils...
241
+ 0: Loading extension module utils...
242
+ 0: Loading extension module utils...Loading extension module utils...
243
+ 0:
244
+ 0: Loading extension module utils...
245
+ 0: Loading extension module utils...
246
+ 0: Loading extension module utils...
247
+ 1: Loading extension module utils...
248
+ 1: Loading extension module utils...
249
+ 1: Loading extension module utils...
250
+ 1: Loading extension module utils...
251
+ 1: Loading extension module utils...
252
+ 1: Loading extension module utils...
253
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
254
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
255
+ 0: Loading extension module utils...
256
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
257
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
258
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
259
+ 0: Loading extension module utils...
260
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
261
+ 0: Loading extension module utils...
262
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
263
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
264
+ 0: Loading extension module utils...
265
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
266
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
267
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
268
+ 0: Loading extension module utils...
269
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
270
+ 0: Loading extension module utils...
271
+ 1: Loading extension module utils...
272
+ 1: Loading extension module utils...
273
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
274
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
275
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
276
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
277
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
278
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
279
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
280
+ 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
281
+ 1: No modifications detected for re-loaded extension module utils, skipping build step...
282
+ 1: Loading extension module utils...
283
+ 1: No modifications detected for re-loaded extension module utils, skipping build step...
284
+ 1: Loading extension module utils...
285
+ 1: No modifications detected for re-loaded extension module utils, skipping build step...
286
+ 1: Loading extension module utils...
287
+ 1: No modifications detected for re-loaded extension module utils, skipping build step...
288
+ 1: Loading extension module utils...
289
+ 1: No modifications detected for re-loaded extension module utils, skipping build step...
290
+ 1: Loading extension module utils...
291
+ 1: No modifications detected for re-loaded extension module utils, skipping build step...
292
+ 1: Loading extension module utils...
293
+ 1: No modifications detected for re-loaded extension module utils, skipping build step...
294
+ 1: Loading extension module utils...
295
+ 1: No modifications detected for re-loaded extension module utils, skipping build step...
296
+ 1: Loading extension module utils...
297
+ 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...
298
+ 0: No modifications detected for re-loaded extension module utils, skipping build step...
299
+ 0: Loading extension module utils...
300
+ 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
301
+ 0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
14m100m100m/logs/3163413.out ADDED
The diff for this file is too large to render. See raw diff
 
14m100m100m/sbatch_14m100m100m.sh ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
3
+ #SBATCH --nodes=2
4
+ #SBATCH --ntasks-per-node=1
5
+ #SBATCH --cpus-per-task=32
6
+ #SBATCH --mem=256G
7
+ #SBATCH -p standard-g
8
+ #SBATCH -t 2-0:00:00
9
+ #SBATCH --gpus-per-node=mi250:8
10
+ #SBATCH --exclusive=user
11
+ #SBATCH --hint=nomultithread
12
+ #SBATCH --account=project_462000119
13
+ #SBATCH -o logs/%j.out
14
+ #SBATCH -e logs/%j.err
15
+
16
+ VARIANT=14m100m100m
17
+
18
+ # if run without sbatch, invoke here
19
+ if [ -z $SLURM_JOB_ID ]; then
20
+ mkdir -p logs
21
+ sbatch "$0"
22
+ exit
23
+ fi
24
+
25
+ set -euo pipefail
26
+
27
+ # symlink logs/latest.out and logs/latest.err
28
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
29
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
30
+
31
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
32
+ CHECKPOINT_PATH=checkpoints_$VARIANT
33
+ TENSORBOARD_PATH=tensorboard_$VARIANT
34
+ mkdir -p $CHECKPOINT_PATH
35
+ mkdir -p $TENSORBOARD_PATH
36
+
37
+ # Data
38
+ VOCAB_FILE="gpt2/vocab.json"
39
+ MERGE_FILE="gpt2/merges.txt"
40
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
41
+ TRAIN_DATA_PATH=train100m.txt
42
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document"
43
+ VALID_DATA_PATH=val.txt
44
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
45
+
46
+
47
+ PP_SIZE=1
48
+ TP_SIZE=1
49
+
50
+ MICRO_BATCH_SIZE=16
51
+ GRADIENT_ACCUMULATION_STEPS=1
52
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
53
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
54
+
55
+ # Model parameters
56
+ source model_params.sh
57
+ MODEL_PARAM=("${PARAM_14M[@]}")
58
+ NHIDDEN=${MODEL_PARAM[0]}
59
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
60
+ KV_SIZE=${MODEL_PARAM[2]}
61
+ NHEADS=${MODEL_PARAM[3]}
62
+ NLAYERS=${MODEL_PARAM[4]}
63
+ SEQ_LEN=2048
64
+
65
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
66
+
67
+ SAVE_INTERVAL=10000
68
+
69
+ # Tokens: 100 000 000
70
+ # -> Samples: 48828.125
71
+ TRAIN_SAMPLES=48_828
72
+
73
+ OPTIMIZER_ARGS=" \
74
+ --optimizer adam \
75
+ --adam-beta1 0.9 \
76
+ --adam-beta2 0.999 \
77
+ --adam-eps 1e-8 \
78
+ --lr 2e-4 \
79
+ --min-lr 2e-5 \
80
+ --lr-decay-style cosine \
81
+ --lr-decay-samples $TRAIN_SAMPLES \
82
+ --lr-warmup-samples 488 \
83
+ --clip-grad 1.0 \
84
+ --weight-decay 1e-1 \
85
+ "
86
+
87
+ GPT_ARGS=" \
88
+ --num-layers $NLAYERS \
89
+ --hidden-size $NHIDDEN \
90
+ --num-attention-heads $NHEADS \
91
+ --kv-channels $KV_SIZE \
92
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
93
+ --seq-length $SEQ_LEN \
94
+ --max-position-embeddings $SEQ_LEN \
95
+ --micro-batch-size $MICRO_BATCH_SIZE \
96
+ --global-batch-size $GLOBAL_BATCH_SIZE \
97
+ --train-samples $TRAIN_SAMPLES \
98
+ --vocab-file $VOCAB_FILE \
99
+ --merge-file $MERGE_FILE \
100
+ --loss-scale 12 \
101
+ --clip-grad 1.0 \
102
+ --kill-switch-path $KILL_SWITCH_PATH \
103
+ --bf16 \
104
+ --checkpoint-activations \
105
+ $OPTIMIZER_ARGS \
106
+ "
107
+
108
+ OUTPUT_ARGS=" \
109
+ --log-interval 10 \
110
+ --save-interval $SAVE_INTERVAL \
111
+ --eval-interval 1000 \
112
+ --eval-iters 1 \
113
+ --tensorboard-dir $TENSORBOARD_PATH \
114
+ --tensorboard-queue-size 5 \
115
+ --log-timers-to-tensorboard \
116
+ --log-batch-size-to-tensorboard \
117
+ --log-validation-ppl-to-tensorboard \
118
+ "
119
+
120
+ ZERO_STAGE=0
121
+
122
+ mkdir -p ds_configs
123
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
124
+
125
+ cat <<EOF > $DS_CONFIG_PATH
126
+ {
127
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
128
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
129
+ "gradient_clipping": 1.0,
130
+ "zero_optimization": {
131
+ "stage": $ZERO_STAGE
132
+ },
133
+ "bf16": {
134
+ "enabled": true
135
+ },
136
+ "steps_per_print": 2000,
137
+ "wall_clock_breakdown": false
138
+ }
139
+ EOF
140
+
141
+ DEEPSPEED_ARGS=" \
142
+ --deepspeed \
143
+ --deepspeed_config $DS_CONFIG_PATH \
144
+ --zero-stage $ZERO_STAGE \
145
+ "
146
+
147
+ CMD=" \
148
+ Megatron-DeepSpeed/pretrain_gpt.py \
149
+ --tensor-model-parallel-size $TP_SIZE \
150
+ --pipeline-model-parallel-size $PP_SIZE \
151
+ $GPT_ARGS \
152
+ $OUTPUT_ARGS \
153
+ --save $CHECKPOINT_PATH \
154
+ --load $CHECKPOINT_PATH \
155
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
156
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
157
+ --data-impl mmap \
158
+ $DEEPSPEED_ARGS \
159
+ "
160
+
161
+ echo $CMD
162
+
163
+ echo "START $SLURM_JOBID: $(date)"
164
+
165
+ # bash launch_srun.sh $CMD
166
+ srun --label launch.sh $CMD
167
+
168
+ echo "END $SLURM_JOBID: $(date)"
14m100m100m/sbatch_14m100m100mval.sh ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
3
+ #SBATCH --nodes=2
4
+ #SBATCH --ntasks-per-node=1
5
+ #SBATCH --cpus-per-task=32
6
+ #SBATCH --mem=256G
7
+ #SBATCH -p small-g
8
+ #SBATCH -t 12:00:00
9
+ #SBATCH --gpus-per-node=mi250:8
10
+ #SBATCH --exclusive=user
11
+ #SBATCH --hint=nomultithread
12
+ #SBATCH --account=project_462000119
13
+ #SBATCH -o logs/%j.out
14
+ #SBATCH -e logs/%j.err
15
+
16
+ VARIANT=14m100m100mval
17
+ VARIANT_CKPT=14m100m100m
18
+
19
+ # if run without sbatch, invoke here
20
+ if [ -z $SLURM_JOB_ID ]; then
21
+ mkdir -p logs
22
+ sbatch "$0"
23
+ exit
24
+ fi
25
+
26
+ set -euo pipefail
27
+
28
+ # symlink logs/latest.out and logs/latest.err
29
+ ln -f -s $SLURM_JOB_ID.out logs/latest.out
30
+ ln -f -s $SLURM_JOB_ID.err logs/latest.err
31
+
32
+ KILL_SWITCH_PATH=kill-switch-$VARIANT
33
+ CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT
34
+ TENSORBOARD_PATH=tensorboard_$VARIANT
35
+
36
+ # Data
37
+ VOCAB_FILE="gpt2/vocab.json"
38
+ MERGE_FILE="gpt2/merges.txt"
39
+ #DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document"
40
+ TRAIN_DATA_PATH=train100m.txt
41
+ # "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document"
42
+ VALID_DATA_PATH=val.txt
43
+ # "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
44
+
45
+ PP_SIZE=1
46
+ TP_SIZE=1
47
+
48
+ MICRO_BATCH_SIZE=4
49
+ GRADIENT_ACCUMULATION_STEPS=1
50
+ WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
51
+ GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
52
+
53
+ # Model parameters
54
+ source model_params.sh
55
+ MODEL_PARAM=("${PARAM_14M[@]}")
56
+ NHIDDEN=${MODEL_PARAM[0]}
57
+ FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
58
+ KV_SIZE=${MODEL_PARAM[2]}
59
+ NHEADS=${MODEL_PARAM[3]}
60
+ NLAYERS=${MODEL_PARAM[4]}
61
+ SEQ_LEN=2048
62
+
63
+ echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"
64
+
65
+ SAVE_INTERVAL=1000
66
+
67
+ # Tokens: 31633480000
68
+ # -> Samples: 15446035
69
+ TRAIN_SAMPLES=1
70
+
71
+ OPTIMIZER_ARGS=" \
72
+ --optimizer adam \
73
+ --adam-beta1 0.9 \
74
+ --adam-beta2 0.999 \
75
+ --adam-eps 1e-8 \
76
+ --lr 2e-4 \
77
+ --min-lr 2e-5 \
78
+ --lr-decay-style cosine \
79
+ --lr-decay-samples $TRAIN_SAMPLES \
80
+ --lr-warmup-samples 0 \
81
+ --clip-grad 1.0 \
82
+ --weight-decay 1e-1 \
83
+ --override-lr-scheduler \
84
+ --no-load-optim \
85
+ --reset-progress \
86
+ "
87
+
88
+ GPT_ARGS=" \
89
+ --num-layers $NLAYERS \
90
+ --hidden-size $NHIDDEN \
91
+ --num-attention-heads $NHEADS \
92
+ --kv-channels $KV_SIZE \
93
+ --ffn-hidden-size $FFN_HIDDEN_SIZE \
94
+ --seq-length $SEQ_LEN \
95
+ --max-position-embeddings $SEQ_LEN \
96
+ --micro-batch-size $MICRO_BATCH_SIZE \
97
+ --global-batch-size $GLOBAL_BATCH_SIZE \
98
+ --train-samples $TRAIN_SAMPLES \
99
+ --vocab-file $VOCAB_FILE \
100
+ --merge-file $MERGE_FILE \
101
+ --clip-grad 1.0 \
102
+ --kill-switch-path $KILL_SWITCH_PATH \
103
+ --bf16 \
104
+ $OPTIMIZER_ARGS \
105
+ "
106
+
107
+ OUTPUT_ARGS=" \
108
+ --log-interval 10 \
109
+ --save-interval $SAVE_INTERVAL \
110
+ --eval-interval 1 \
111
+ --eval-iters 100 \
112
+ --tensorboard-dir $TENSORBOARD_PATH \
113
+ --tensorboard-queue-size 5 \
114
+ --log-timers-to-tensorboard \
115
+ --log-batch-size-to-tensorboard \
116
+ --log-validation-ppl-to-tensorboard \
117
+ "
118
+
119
+ ZERO_STAGE=0
120
+
121
+ mkdir -p ds_configs
122
+ DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
123
+
124
+ cat <<EOF > $DS_CONFIG_PATH
125
+ {
126
+ "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
127
+ "train_batch_size": $GLOBAL_BATCH_SIZE,
128
+ "gradient_clipping": 1.0,
129
+ "zero_optimization": {
130
+ "stage": $ZERO_STAGE
131
+ },
132
+ "bf16": {
133
+ "enabled": true
134
+ },
135
+ "steps_per_print": 2000,
136
+ "wall_clock_breakdown": false
137
+ }
138
+ EOF
139
+
140
+ DEEPSPEED_ARGS=" \
141
+ --deepspeed \
142
+ --deepspeed_config $DS_CONFIG_PATH \
143
+ --zero-stage $ZERO_STAGE \
144
+ "
145
+
146
+ CMD=" \
147
+ Megatron-DeepSpeed/pretrain_gpt.py \
148
+ --tensor-model-parallel-size $TP_SIZE \
149
+ --pipeline-model-parallel-size $PP_SIZE \
150
+ $GPT_ARGS \
151
+ $OUTPUT_ARGS \
152
+ --save $CHECKPOINT_PATH \
153
+ --load $CHECKPOINT_PATH \
154
+ --train-weighted-split-paths-path $TRAIN_DATA_PATH \
155
+ --valid-weighted-split-paths-path $VALID_DATA_PATH \
156
+ --data-impl mmap \
157
+ $DEEPSPEED_ARGS \
158
+ "
159
+
160
+ echo $CMD
161
+
162
+ echo "START $SLURM_JOBID: $(date)"
163
+
164
+ # bash launch_srun.sh $CMD
165
+ srun --label launch.sh $CMD
166
+
167
+ echo "END $SLURM_JOBID: $(date)"
14m100m100m/tensorboard_14m100m100m/events.out.tfevents.1677508517.nid006613.13417.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41e5422c9e2fcd0c5db2be12c2a71e1f9a96c5897ded7a3ffd50c01d25d5534a
3
+ size 355324
14m100m100m/tensorboard_14m100m100mval/events.out.tfevents.1677509121.nid007495.92522.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34a2c6f2333cf39f269d79aab690d59e95e286140cd83c0fc335adab3226351d
3
+ size 980
14m100m100m/tensorboard_14m100m100mval/events.out.tfevents.1678800406.nid007281.60225.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f4ba864573adf53dc8d0673c544f12cb5e07a2711b018e2a3d2266a17e4b03b
3
+ size 980
{14m1b5100mv → 14m1b5100m}/3307602.err RENAMED
File without changes
{14m1b5100mv → 14m1b5100m}/3307602.out RENAMED
File without changes
{14m7b5100mv → 14m7b5100m}/3307761.err RENAMED
File without changes
{14m7b5100mv → 14m7b5100m}/3307761.out RENAMED
File without changes
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c4b69f38a245da6e4d0513eccbb04c26c76b474abeac577ecbb3773aec6e629
3
+ size 51395415
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1b34567610692468ac21331c9e23ed6ed3863f8c63081e212bfb8879611667a
3
+ size 51395373
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7447a3a8cdabd9ec3e4b72f779d6492f1cfb529f2775357c0a489190d5f94795
3
+ size 51395437
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03291cf5118afad0a131fc7705594bd07bfbee912ee4defdac94fd682c678117
3
+ size 51395501
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf5081ba8de8e3d229a8e54fb43c192fb5c51ae6ffc8f4769a5ec2871c1253e9
3
+ size 51395565
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1979150e5ddc2df4bce1339ab00276fce8721cfbb4cfa29816d61a7dd65b0441
3
+ size 51395373
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df3a858ac35888b1acb0ed9db854b4eb7ffa7eb7de08532c91e87cdff58810eb
3
+ size 51395373
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd5dd0a03580e3d1cfd18d949c833097e194c9dffc9a661d108d5867a1fe71da
3
+ size 51395437
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d0237a57736cfdf4dfc5b4987d2c7ca3794164c42b93ce2d37cc6c00b001616
3
+ size 51395501
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d4a57711df649bc7569868378a6aa543d4dbd02eb83fc6fc7720b5c8685e133
3
+ size 51395437
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:130656d77b8a7ac04a92747124612fe3abea6473a0ea55be3bc76bb3ab04386c
3
+ size 51395501
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:291bd2d97eab3485e7aab1a656533129d7f4233ea771a2fc172bbab641312b9f
3
+ size 51395426
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e121359c0a68dff23c7c5342db576d9af4aa6e6c831b7d916feb5baef63ef3a
3
+ size 51395373
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6755c268beff0fdfe0350c8c5f1d03ac320b9e167a426fbe9f935c04aa47389
3
+ size 51395437
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fbd4300577ba63e9fc50bd62e1720ae9f8a9251388c7b8e3119829f9dcf62d8
3
+ size 51395501
1b112b100m/1b112b100m/global_step23189/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23c999e23b8f2cdf65437f5147d246a3f2c39af746e52e25e898241331694510
3
+ size 51395501