RaushanTurganbay HF staff committed on
Commit
fbf5f1e
1 Parent(s): ea5c706

Upload LlavaNextVideoForConditionalGeneration

Browse files
config.json CHANGED
@@ -25,46 +25,173 @@
25
  1008
26
  ]
27
  ],
28
- "image_token_index": 64001,
29
- "model_type": "llava_next",
30
  "projector_hidden_act": "gelu",
31
  "spatial_pool_mode": "average",
32
- "spatial_pool_out_channels": 1024,
33
  "spatial_pool_stride": 2,
34
  "text_config": {
35
  "_name_or_path": "NousResearch/Nous-Hermes-2-Yi-34B",
 
36
  "architectures": [
37
  "LlamaForCausalLM"
38
  ],
 
 
 
 
 
 
 
 
 
 
 
 
39
  "eos_token_id": 7,
 
 
 
 
 
40
  "hidden_size": 7168,
 
 
 
 
 
41
  "intermediate_size": 20480,
 
 
 
 
 
 
 
 
42
  "max_position_embeddings": 4096,
 
 
43
  "model_type": "llama",
 
44
  "num_attention_heads": 56,
 
 
45
  "num_hidden_layers": 60,
46
  "num_key_value_heads": 8,
 
 
 
 
47
  "pad_token_id": 0,
 
 
 
 
 
 
 
 
48
  "rms_norm_eps": 1e-05,
 
49
  "rope_theta": 5000000.0,
 
 
 
 
 
 
 
 
 
 
50
  "torch_dtype": "bfloat16",
 
 
 
51
  "use_cache": false,
52
  "vocab_size": 64064
53
  },
54
  "tie_word_embeddings": false,
55
  "torch_dtype": "bfloat16",
56
- "transformers_version": "4.42.0.dev0",
57
  "use_image_newline_parameter": true,
58
- "video_token_index": 64000,
59
  "vision_config": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  "hidden_size": 1024,
 
 
 
 
61
  "image_size": 336,
 
 
62
  "intermediate_size": 4096,
 
 
 
 
 
 
 
 
 
 
63
  "model_type": "clip_vision_model",
 
64
  "num_attention_heads": 16,
 
 
 
65
  "num_hidden_layers": 24,
 
 
 
 
 
66
  "patch_size": 14,
 
 
67
  "projection_dim": 768,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  "vocab_size": 32000
69
  },
70
  "vision_feature_layer": -2,
 
25
  1008
26
  ]
27
  ],
28
+ "image_token_index": 64004,
29
+ "model_type": "llava_next_video",
30
  "projector_hidden_act": "gelu",
31
  "spatial_pool_mode": "average",
 
32
  "spatial_pool_stride": 2,
33
  "text_config": {
34
  "_name_or_path": "NousResearch/Nous-Hermes-2-Yi-34B",
35
+ "add_cross_attention": false,
36
  "architectures": [
37
  "LlamaForCausalLM"
38
  ],
39
+ "attention_bias": false,
40
+ "attention_dropout": 0.0,
41
+ "bad_words_ids": null,
42
+ "begin_suppress_tokens": null,
43
+ "bos_token_id": 1,
44
+ "chunk_size_feed_forward": 0,
45
+ "cross_attention_hidden_size": null,
46
+ "decoder_start_token_id": null,
47
+ "diversity_penalty": 0.0,
48
+ "do_sample": false,
49
+ "early_stopping": false,
50
+ "encoder_no_repeat_ngram_size": 0,
51
  "eos_token_id": 7,
52
+ "exponential_decay_length_penalty": null,
53
+ "finetuning_task": null,
54
+ "forced_bos_token_id": null,
55
+ "forced_eos_token_id": null,
56
+ "hidden_act": "silu",
57
  "hidden_size": 7168,
58
+ "id2label": {
59
+ "0": "LABEL_0",
60
+ "1": "LABEL_1"
61
+ },
62
+ "initializer_range": 0.02,
63
  "intermediate_size": 20480,
64
+ "is_decoder": false,
65
+ "is_encoder_decoder": false,
66
+ "label2id": {
67
+ "LABEL_0": 0,
68
+ "LABEL_1": 1
69
+ },
70
+ "length_penalty": 1.0,
71
+ "max_length": 20,
72
  "max_position_embeddings": 4096,
73
+ "min_length": 0,
74
+ "mlp_bias": false,
75
  "model_type": "llama",
76
+ "no_repeat_ngram_size": 0,
77
  "num_attention_heads": 56,
78
+ "num_beam_groups": 1,
79
+ "num_beams": 1,
80
  "num_hidden_layers": 60,
81
  "num_key_value_heads": 8,
82
+ "num_return_sequences": 1,
83
+ "output_attentions": false,
84
+ "output_hidden_states": false,
85
+ "output_scores": false,
86
  "pad_token_id": 0,
87
+ "prefix": null,
88
+ "pretraining_tp": 1,
89
+ "problem_type": null,
90
+ "pruned_heads": {},
91
+ "remove_invalid_values": false,
92
+ "repetition_penalty": 1.0,
93
+ "return_dict": true,
94
+ "return_dict_in_generate": false,
95
  "rms_norm_eps": 1e-05,
96
+ "rope_scaling": null,
97
  "rope_theta": 5000000.0,
98
+ "sep_token_id": null,
99
+ "suppress_tokens": null,
100
+ "task_specific_params": null,
101
+ "temperature": 1.0,
102
+ "tf_legacy_loss": false,
103
+ "tie_encoder_decoder": false,
104
+ "tie_word_embeddings": false,
105
+ "tokenizer_class": null,
106
+ "top_k": 50,
107
+ "top_p": 1.0,
108
  "torch_dtype": "bfloat16",
109
+ "torchscript": false,
110
+ "typical_p": 1.0,
111
+ "use_bfloat16": false,
112
  "use_cache": false,
113
  "vocab_size": 64064
114
  },
115
  "tie_word_embeddings": false,
116
  "torch_dtype": "bfloat16",
117
+ "transformers_version": "4.43.0.dev0",
118
  "use_image_newline_parameter": true,
119
+ "video_token_index": 64003,
120
  "vision_config": {
121
+ "_name_or_path": "",
122
+ "add_cross_attention": false,
123
+ "architectures": null,
124
+ "attention_dropout": 0.0,
125
+ "bad_words_ids": null,
126
+ "begin_suppress_tokens": null,
127
+ "bos_token_id": null,
128
+ "chunk_size_feed_forward": 0,
129
+ "cross_attention_hidden_size": null,
130
+ "decoder_start_token_id": null,
131
+ "diversity_penalty": 0.0,
132
+ "do_sample": false,
133
+ "early_stopping": false,
134
+ "encoder_no_repeat_ngram_size": 0,
135
+ "eos_token_id": null,
136
+ "exponential_decay_length_penalty": null,
137
+ "finetuning_task": null,
138
+ "forced_bos_token_id": null,
139
+ "forced_eos_token_id": null,
140
+ "hidden_act": "quick_gelu",
141
  "hidden_size": 1024,
142
+ "id2label": {
143
+ "0": "LABEL_0",
144
+ "1": "LABEL_1"
145
+ },
146
  "image_size": 336,
147
+ "initializer_factor": 1.0,
148
+ "initializer_range": 0.02,
149
  "intermediate_size": 4096,
150
+ "is_decoder": false,
151
+ "is_encoder_decoder": false,
152
+ "label2id": {
153
+ "LABEL_0": 0,
154
+ "LABEL_1": 1
155
+ },
156
+ "layer_norm_eps": 1e-05,
157
+ "length_penalty": 1.0,
158
+ "max_length": 20,
159
+ "min_length": 0,
160
  "model_type": "clip_vision_model",
161
+ "no_repeat_ngram_size": 0,
162
  "num_attention_heads": 16,
163
+ "num_beam_groups": 1,
164
+ "num_beams": 1,
165
+ "num_channels": 3,
166
  "num_hidden_layers": 24,
167
+ "num_return_sequences": 1,
168
+ "output_attentions": false,
169
+ "output_hidden_states": false,
170
+ "output_scores": false,
171
+ "pad_token_id": null,
172
  "patch_size": 14,
173
+ "prefix": null,
174
+ "problem_type": null,
175
  "projection_dim": 768,
176
+ "pruned_heads": {},
177
+ "remove_invalid_values": false,
178
+ "repetition_penalty": 1.0,
179
+ "return_dict": true,
180
+ "return_dict_in_generate": false,
181
+ "sep_token_id": null,
182
+ "suppress_tokens": null,
183
+ "task_specific_params": null,
184
+ "temperature": 1.0,
185
+ "tf_legacy_loss": false,
186
+ "tie_encoder_decoder": false,
187
+ "tie_word_embeddings": true,
188
+ "tokenizer_class": null,
189
+ "top_k": 50,
190
+ "top_p": 1.0,
191
+ "torch_dtype": null,
192
+ "torchscript": false,
193
+ "typical_p": 1.0,
194
+ "use_bfloat16": false,
195
  "vocab_size": 32000
196
  },
197
  "vision_feature_layer": -2,
generation_config.json CHANGED
@@ -3,6 +3,6 @@
3
  "bos_token_id": 1,
4
  "eos_token_id": 7,
5
  "pad_token_id": 0,
6
- "transformers_version": "4.42.0.dev0",
7
  "use_cache": false
8
  }
 
3
  "bos_token_id": 1,
4
  "eos_token_id": 7,
5
  "pad_token_id": 0,
6
+ "transformers_version": "4.43.0.dev0",
7
  "use_cache": false
8
  }
model-00001-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddca4170d76423435d65b809067de5b64c85d1d0f247b9eecca4091a063c0b78
3
  size 4990118704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:546ae2e072b086b35ede270c9ec23b2e5df713ed81d2bcc44b85641f2ac9fe1d
3
  size 4990118704
model-00015-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad0f6c433416181ed9cd169cacc7187a5d415384eceb04a220fae02071f8450c
3
  size 1505667840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7564f5cf1bf77fc4ecb4a9deaa14bd6dd0630658a37272acdd06c0b0da0ec3d8
3
  size 1505667840