#!/usr/bin/env bash
# Launch a distributed job on the current worker through torchrun or deepspeed.
#
# Usage: <this-script> <launcher> <command>
#   launcher  "torchrun" or "deepspeed"
#   command   the training script and its arguments as ONE string; it is
#             word-split and appended to the launcher invocation
#
# Environment read:
#   ARNOLD_WORKER_NUM    passed as --nnodes (torchrun) / --num_nodes (deepspeed)
#   ARNOLD_ID            passed as --node_rank (torchrun only)
#   ARNOLD_WORKER_GPU    passed as --nproc_per_node / --num_gpus
#   METIS_WORKER_0_HOST  passed as --master_addr
#   METIS_WORKER_0_PORT  comma-separated list; its first entry is --master_port
#
# Exit status: the launcher's exit status, 2 on a usage error, 1 on any other
# setup error.

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print the usage line to stderr and abort.
usage() {
  printf 'Usage: %s <torchrun|deepspeed> "<command and args>"\n' "${0##*/}" >&2
  exit 2
}

# Fail fast: with a missing launcher name the old `[ $env == ... ]` tests
# errored out and the script exited 0 without launching anything.
(( $# >= 2 )) || usage
launcher="$1"
cmd="$2"

cd /mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA \
  || die "cannot cd to /mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA"
echo "$PWD"

# Split the port list on commas (and spaces) without a subshell and without
# exposing the value to glob expansion; only the first port is used.
IFS=', ' read -r -a ports <<< "${METIS_WORKER_0_PORT:-}"
port="${ports[0]:-}"

echo "total workers: ${ARNOLD_WORKER_NUM}"
echo "cur worker id: ${ARNOLD_ID}"
echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
echo "master ip: ${METIS_WORKER_0_HOST}"
echo "master port: ${port}"

# An empty value here would otherwise shift the launcher's argument list.
: "${ARNOLD_WORKER_NUM:?must be set}"
: "${ARNOLD_WORKER_GPU:?must be set}"
: "${METIS_WORKER_0_HOST:?must be set}"
[[ -n "$port" ]] || die "METIS_WORKER_0_PORT is unset or empty"

# Optional environment tuning, disabled by default:
#export OMP_NUM_THREADS=8
#export NCCL_IB_DISABLE=0
#export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE}
#export NCCL_SOCKET_IFNAME=eth0
# export NCCL_DEBUG=INFO

printf '%s\n' "$launcher"
printf '%s\n' "$cmd"

case "$launcher" in
  torchrun)
    : "${ARNOLD_ID:?must be set for torchrun}"
    # $cmd is deliberately left unquoted so it is word-split into the
    # script path and its arguments.
    # shellcheck disable=SC2086
    torchrun \
      --nnodes "$ARNOLD_WORKER_NUM" \
      --node_rank "$ARNOLD_ID" \
      --nproc_per_node "$ARNOLD_WORKER_GPU" \
      --master_addr "$METIS_WORKER_0_HOST" \
      --master_port "$port" \
      $cmd
    ;;
  deepspeed)
    # NOTE(review): no --node_rank or hostfile is passed here; confirm that
    # multi-node deepspeed launches behave as intended.
    # $cmd is deliberately left unquoted so it is word-split (see above).
    # shellcheck disable=SC2086
    deepspeed \
      --num_nodes "$ARNOLD_WORKER_NUM" \
      --num_gpus "$ARNOLD_WORKER_GPU" \
      --master_addr "$METIS_WORKER_0_HOST" \
      --master_port "$port" \
      $cmd
    ;;
  *)
    die "unknown launcher '${launcher}' (expected 'torchrun' or 'deepspeed')"
    ;;
esac

# Legacy snippets kept for reference (disabled):
#torchrun \
#--nnodes $ARNOLD_WORKER_NUM \
#--node_rank $ARNOLD_ID \
#--nproc_per_node $ARNOLD_WORKER_GPU \
#--master_addr $METIS_WORKER_0_HOST \
#--master_port $port \
#$cmd
#for i in "$*"; do
# echo $i
# $i
#done