#!/bin/bash
# Starts as many whisper workers as the CPU core count and physical memory
# allow, then launches the waiServer.

# Base name of this script; used as the prefix of status messages.
script_name="$(basename "$0")"
# Admin log file that receives the timestamped log entries.
adm_log='/workspace/whisper/logs/waiAdmin.log'

# Write a log message to one or both log destinations.
# Globals:
#   adm_log - path of the admin log file (read)
# Arguments:
#   $1 - log message text
#   $2 - destination bitmask: 1 = stdout, 2 = adm_log, 3 = both
#        (defaults to 3 when omitted, so a message is never dropped)
log() {
    local message=$1
    local dest=${2:-3}
    local ts_str

    if (( (dest & 0x1) == 0x1 )); then
        # %b expands backslash escapes the same way `echo -e` does, so callers
        # can keep embedding "\n" in their messages.
        printf '%b\n' "${message}"
    fi
    if (( (dest & 0x2) == 0x2 )); then
        # %3N (GNU date) limits the nanosecond field to milliseconds.
        ts_str=$(date +"%Y-%m-%d %H:%M:%S,%3N")
        # The file copy keeps backslash sequences literal.
        printf '%s - %s\n' "${ts_str}" "${message}" >> "${adm_log}"
    fi
}

# Make the admin log file (and its directory) if it does not already exist.
# A failure is reported on stderr but is not fatal, so start-up continues and
# messages still reach stdout.
if [[ ! -f "${adm_log}" ]]; then
    if ! mkdir -p -- "$(dirname -- "${adm_log}")" || ! touch -- "${adm_log}"; then
        printf 'WARNING - cannot create admin log file %s\n' "${adm_log}" >&2
    fi
fi

log "INFO - ${script_name} state: starting" 3

# Positional arguments (all optional):
#   $1 percentage_nonworker_cores - percentage of all cores kept away from the
#      workers; the resulting core count is rounded to the nearest multiple of
#      2, so 17 yields 6 non-worker cores on a 32 core CPU
#   $2 min_nonworker_cores - lower bound for the non-worker core count, i.e.
#      nonworker_cores = max(min_nonworker_cores, nonworker_cores)
#   $3 cores_per_worker - number of cores each worker is pinned to
#   $4 percentage_memory - fraction (0-1) of physical memory used when sizing
#      the worker count; 0.5 means 50%
#   $5 memory_per_worker - memory in GB accounted for each worker
percentage_nonworker_cores=${1:-17}
min_nonworker_cores=${2:-8}
cores_per_worker=${3:-4}
percentage_memory=${4:-0.5}
memory_per_worker=${5:-4.4}

# Usage summary, printed to stdout only.
log "${script_name} - This script will start Netint Whisper AI Service with appropriate amount of workers for CPU of system" 1
log "arg1 is percentage_nonworker_cores (default: 17)" 1
log "arg2 is min_nonworker_cores (default: 8)" 1
log "arg3 is cores_per_worker (default: 4)" 1
log "arg4 is percentage_memory (default: 0.5)" 1
log "arg5 is memory_per_worker (default: 4.4)" 1

# Check arg 1: must be a plain non-negative integer in 0-100.
# The regex runs first because a non-numeric value would make a numeric test
# error out (and thereby pass), and because the value is later fed to shell
# arithmetic, which must never see arbitrary text.
if [[ ! "${percentage_nonworker_cores}" =~ ^0*[0-9]{1,3}$ ]] \
    || (( 10#${percentage_nonworker_cores} > 100 )); then
    log "ERROR - arg1 for percentage_nonworker_cores is not in range 0-100" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi
# Force base 10 so a leading zero (e.g. "08") is not read as octal later on.
percentage_nonworker_cores=$(( 10#${percentage_nonworker_cores} ))

# Check arg 4: must be a non-negative decimal number in 0-1.
# The value is handed to awk with -v instead of being pasted into the awk
# program text, so it can never be executed as awk code.
decimal_re='^([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
if [[ ! "${percentage_memory}" =~ ${decimal_re} ]] \
    || ! awk -v pm="${percentage_memory}" 'BEGIN { exit !(pm + 0 >= 0 && pm + 0 <= 1) }'; then
    log "ERROR - arg4 for percentage_memory is not in range 0-1" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi

# Calculate the number of cores available
total_cores=$(nproc)
# Share of all cores given by percentage_nonworker_cores, rounded to the
# nearest multiple of 2 with integer arithmetic only.
cores_reserved_for_system=$(( ((percentage_nonworker_cores * total_cores / 100) + 1) / 2 * 2 ))
# Use below once basic-calculator installed into docker image
# cores_reserved_for_system=$(echo "scale=0; (((($percentage_nonworker_cores * $total_cores) / 200) + 0.5) / 1) * 2" | bc)

# Check arg 2: plain integer in 0..total_cores.
# The regex runs first so that non-numeric text is rejected instead of making
# the numeric comparison error out and pass.
if [[ ! "${min_nonworker_cores}" =~ ^0*[0-9]{1,9}$ ]] \
    || (( 10#${min_nonworker_cores} > total_cores )); then
    log "ERROR - arg2 for min_nonworker_cores is not in range 0-${total_cores}" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi
# Check arg 3: plain integer in 1..total_cores. Zero is rejected because the
# available cores are later divided by this value.
if [[ ! "${cores_per_worker}" =~ ^0*[0-9]{1,9}$ ]] \
    || (( 10#${cores_per_worker} < 1 || 10#${cores_per_worker} > total_cores )); then
    log "ERROR - arg3 for cores_per_worker is not in range 1-${total_cores}" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi
# Force base 10 so a leading zero (e.g. "08") is not read as octal later on.
min_nonworker_cores=$(( 10#${min_nonworker_cores} ))
cores_per_worker=$(( 10#${cores_per_worker} ))

# nonworker cores = max(min_nonworker_cores, percentage based value)
if (( cores_reserved_for_system < min_nonworker_cores )); then
    cores_reserved_for_system=${min_nonworker_cores}
fi
available_cores=$(( total_cores - cores_reserved_for_system ))

if (( available_cores < cores_per_worker )); then
    log "ERROR - Not enough cores available for at least one worker" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi

# Get total physical memory in GB (free -g reports whole-number gibibytes)
total_memory=$(free -g | awk '/^Mem:/{print $2}')
log "INFO - Total memory: ${total_memory} GB" 3

# Check arg 5: must be a decimal number greater than 0, because the memory
# budget is divided by it below.
number_re='^([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
if [[ ! "${memory_per_worker}" =~ ${number_re} ]] \
    || ! awk -v mpw="${memory_per_worker}" 'BEGIN { exit !(mpw + 0 > 0) }'; then
    log "ERROR - arg5 for memory_per_worker is not a number greater than 0" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi

# Calculate memory reserved for system
# NOTE(review): despite the name and log text, this value is what gets divided
# among the workers below - confirm the intended wording.
# Values are passed to awk with -v so they are treated as data, never as code.
memory_reserved_for_system=$(awk -v total="${total_memory}" -v share="${percentage_memory}" \
    'BEGIN { print total * share }')
log "INFO - Memory reserved for system: ${memory_reserved_for_system} GB" 3

# Calculate number of workers based on memory
num_workers_based_on_memory=$(awk -v pool="${memory_reserved_for_system}" -v per="${memory_per_worker}" \
    'BEGIN { print pool / per }')
log "INFO - Number of workers based on memory: ${num_workers_based_on_memory}" 3

# Calculate the final number of workers
num_workers=$(( available_cores / cores_per_worker ))
log "INFO - Number of workers based on cores: ${num_workers}" 3

# Final count = min(core based count, memory based count), truncated to int
num_workers=$(awk -v by_cores="${num_workers}" -v by_memory="${num_workers_based_on_memory}" \
    'BEGIN { print int((by_cores + 0 < by_memory + 0) ? by_cores : by_memory) }')
log "INFO - Final number of workers: ${num_workers}" 3


# Check num_workers; an empty or non-numeric result (a calculation above
# failed) is treated the same as zero workers.
if [[ ! "${num_workers}" =~ ^[0-9]+$ ]] || (( num_workers <= 0 )); then
    log "ERROR - configuration of args and CPU allows for 0 workers" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi

exe_name="./waiWorker.py"  # Adjust the executable name if needed

# Copy .py files
# exe_name is a relative path, so a failed cd would launch the workers from
# the wrong directory; treat it as a start-up failure.
if ! cd /workspace/whisper; then
    log "ERROR - cannot change directory to /workspace/whisper" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi
# A failed copy is recorded but start-up continues.
if ! cp /workspace/whisper/config/*.py .; then
    log "WARNING - could not copy /workspace/whisper/config/*.py to /workspace/whisper" 3
fi

# Start worker instances
# Worker i is pinned to cores_per_worker consecutive cores, starting right
# after the cores reserved for the system.
for ((i = 0; i < num_workers; i++)); do
    start_core=$((cores_reserved_for_system + i * cores_per_worker))
    end_core=$((start_core + cores_per_worker - 1))

    # Space separated core ids; the leading space is kept because cpu_list is
    # logged in exactly this form.
    cpu_list=""
    for ((j = start_core; j <= end_core; j++)); do
        cpu_list+=" ${j}"
    done

    cpu_list_trimmed="${cpu_list# }"
    cpu_range="${start_core}-${end_core}"

    log "INFO - cpu_list: $cpu_list" 3
    log "INFO - cpu_list_trimmed: $cpu_list_trimmed" 3
    log "INFO - cpu_range: $cpu_range" 3

    # Launch the worker in the background, bound to its core range; the worker
    # index is passed as its only argument.
    AIO_NUMA_CPUS="$cpu_list_trimmed" \
    AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" \
    AIO_NUM_THREADS="$cores_per_worker" \
        numactl -C "$cpu_range" python3 "$exe_name" "$i" &
done

# Check if workers immediately crash
sleep 2
retry_count=0
max_retries=5
sleep_interval=1
num_workers_alive=0

# Poll the process table until the expected number of workers is seen or the
# retries are used up.
while (( retry_count < max_retries )); do
    worker_processes=$(ps aux | grep "waiWorker.py" | grep -v "grep")
    # Count matching lines with grep -c: piping an empty listing through
    # `wc -l` would report 1 and hide the case where no worker is alive.
    num_workers_alive=$(grep -c "waiWorker.py" <<<"${worker_processes}")

    if (( num_workers_alive == num_workers )); then
        break
    fi
    retry_count=$((retry_count + 1))
    sleep "${sleep_interval}"
done

if (( num_workers_alive == num_workers )); then
    log "INFO - ${script_name} started ${num_workers} waiWorkers" 3
elif (( num_workers_alive > num_workers )); then
    # More matches than expected is only a warning; start-up continues.
    log "WARNING - ${script_name} expected ${num_workers} waiWorkers but found ${num_workers_alive} waiWorkers" 3
    log "INFO - Worker processes:\n${worker_processes}" 3
    log "INFO - ${script_name} started ${num_workers} waiWorkers" 3
else
    log "ERROR - ${script_name} expected ${num_workers} waiWorkers but found only ${num_workers_alive} waiWorkers" 3
    log "INFO - Worker processes:\n${worker_processes}" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi

# Give more time for rabbitMQ to open listening ports after host boot up (BST-1111)
log "INFO - wait a while for rabbitMQ to become full functional" 3
sleep 20

# Launch Wai Server; the script's exit status reflects whether it succeeded.
if python3 waiServer.py; then
    log "INFO - ${script_name} started waiServer" 3
    log "INFO - ${script_name} state: warm up" 3
    exit 0
else
    # $? still holds the status of the python3 command at this point.
    server_status=$?
    log "ERROR - ${script_name} waiServer exited with status ${server_status}" 3
    log "INFO - ${script_name} state: start up failed" 3
    exit 1
fi
