Spaces:

Ahmadzei
/

RAG

Runtime error

App Files Files Community

RAG / chunked /nltk_chunking /_trainer /chunk_52.txt

Ahmadzei

added 3 more tables for large emb model

5fa1a76 over 1 year ago

raw

history blame contribute delete

2.53 kB

	For example, here are some configurations you can set up:

	yml
	compute_environment: LOCAL_MACHINE
	distributed_type: MULTI_GPU
	downcast_bf16: 'no'
	gpu_ids: all
	machine_rank: 0 #change rank as per the node
	main_process_ip: 192.168.20.1
	main_process_port: 9898
	main_training_function: main
	mixed_precision: fp16
	num_machines: 2
	num_processes: 8
	rdzv_backend: static
	same_network: true
	tpu_env: []
	tpu_use_cluster: false
	tpu_use_sudo: false
	use_cpu: false

	yml
	compute_environment: LOCAL_MACHINE
	distributed_type: FSDP
	downcast_bf16: 'no'
	fsdp_config:
	  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
	  fsdp_backward_prefetch_policy: BACKWARD_PRE
	  fsdp_forward_prefetch: true
	  fsdp_offload_params: false
	  fsdp_sharding_strategy: 1
	  fsdp_state_dict_type: FULL_STATE_DICT
	  fsdp_sync_module_states: true
	  fsdp_transformer_layer_cls_to_wrap: BertLayer
	  fsdp_use_orig_params: true
	machine_rank: 0
	main_training_function: main
	mixed_precision: bf16
	num_machines: 1
	num_processes: 2
	rdzv_backend: static
	same_network: true
	tpu_env: []
	tpu_use_cluster: false
	tpu_use_sudo: false
	use_cpu: false

	yml
	compute_environment: LOCAL_MACHINE
	deepspeed_config:
	  deepspeed_config_file: /home/user/configs/ds_zero3_config.json
	  zero3_init_flag: true
	distributed_type: DEEPSPEED
	downcast_bf16: 'no'
	machine_rank: 0
	main_training_function: main
	num_machines: 1
	num_processes: 4
	rdzv_backend: static
	same_network: true
	tpu_env: []
	tpu_use_cluster: false
	tpu_use_sudo: false
	use_cpu: false

	yml
	compute_environment: LOCAL_MACHINE
	deepspeed_config:
	  gradient_accumulation_steps: 1
	  gradient_clipping: 0.7
	  offload_optimizer_device: cpu
	  offload_param_device: cpu
	  zero3_init_flag: true
	  zero_stage: 2
	distributed_type: DEEPSPEED
	downcast_bf16: 'no'
	machine_rank: 0
	main_training_function: main
	mixed_precision: bf16
	num_machines: 1
	num_processes: 4
	rdzv_backend: static
	same_network: true
	tpu_env: []
	tpu_use_cluster: false
	tpu_use_sudo: false
	use_cpu: false

	The `accelerate launch` command is the recommended way to launch your training script on a distributed system with Accelerate and `Trainer`, using the parameters specified in `config_file.yaml`.