#!/usr/local/python2.7/bin/python
import numpy
import sys
import commands
import pickle
import random
import svmutil

def get_id_mass(f_string):
    """Read a whitespace-delimited mass database file into two column arrays.

    Every non-blank line must hold an identifier in its first column and a
    mass (parseable by ``float``) in its second; any further columns are
    ignored.  Blank or whitespace-only lines are skipped.

    Parameters
    ----------
    f_string : str
        Path of the text file to read.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ids, mass)``, both of shape ``(n, 1)`` where ``n`` is the number
        of parsed lines.  ``ids`` holds the identifier strings and ``mass``
        the corresponding floats, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two columns.
    ValueError
        If the second column is not a valid float.
    """
    ids = []
    mass = []
    # The context manager closes the file even when a line fails to parse.
    with open(f_string) as f:
        for line in f:
            words = line.split()
            if not words:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            ids.append(words[0])
            mass.append(float(words[1]))
    count = len(ids)
    # Column vectors so that a (count, 1) boolean mask can index both arrays.
    ids = numpy.array(ids).reshape((count, 1))
    mass = numpy.array(mass).reshape((count, 1))
    return ids, mass

def get_query_mass(test_spectra):
    """Return the ``mass`` attribute of every spectrum, in input order.

    ``test_spectra`` is any iterable of objects exposing ``.mass``; the
    result is a plain list with one entry per spectrum.
    """
    return [spectrum.mass for spectrum in test_spectra]

def sort_candis(pred_fp,candis,n_fp,n_candi,FP_DB,train_acc,data_fp_ind,fp_sel):
    """Rank candidate ids by how well their fingerprints match a prediction.

    For each candidate the score is the sum, over the selected fingerprint
    bits, of ``log(acc)`` where the candidate's bit equals the predicted bit
    and ``log(1 - acc)`` where it differs, ``acc`` being the per-bit entry of
    ``train_acc``.  Scores are then shifted so the lowest becomes 1 and
    rescaled to positive values not exceeding 1.

    Parameters
    ----------
    pred_fp : array-like
        Predicted fingerprint; must broadcast against a row of ``n_fp`` bits.
    candis : sequence
        Candidate identifiers (keys of ``FP_DB``).
    n_fp : int
        Number of fingerprint bits kept after ``data_fp_ind``/``fp_sel``.
    n_candi : int
        Number of entries of ``candis`` to consider.
    FP_DB : mapping
        Maps an identifier to its full fingerprint, a sequence whose items
        are convertible with ``int`` (e.g. a string of '0'/'1').
    train_acc : numpy.ndarray
        Per-bit accuracies; not modified by this function.
    data_fp_ind : array-like
        1-based positions of the used bits inside the full fingerprint.
    fp_sel : sequence
        Index applied on top of ``data_fp_ind`` to pick a subset of bits.

    Returns
    -------
    list of tuple
        ``(id, score)`` pairs, best score first.  A single input candidate
        yields ``[(candis[0], 1)]`` without any lookup.  If no candidate is
        present in ``FP_DB`` the sentinel ``[("None", -1)]`` is returned.
    """
    if n_candi == 1:
        # A lone candidate wins by default; FP_DB is not consulted.
        return [(candis[0],1)]

    # Some candidates exist only in the mass DB and have no fingerprint:
    # they cannot be scored, so they are dropped from the ranking.
    candis = [candi for candi in candis[:n_candi] if candi in FP_DB]
    n_candi = len(candis)
    if n_candi == 0:
        # Nothing left to rank; use the same "no hit" sentinel as search().
        return [("None", -1)]

    # 1-based bit positions -> 0-based integer indices (loop invariant).
    fp_index = (numpy.asarray(data_fp_ind) - 1).astype(int)

    # One row of selected fingerprint bits per remaining candidate.
    temp_matrix = numpy.zeros((n_candi, n_fp))
    for row, candi in enumerate(candis):
        bits = numpy.array([int(bit) for bit in FP_DB[candi]])
        temp_matrix[row, :] = bits[fp_index][fp_sel]

    # Smooth accuracies of exactly 1 or 0 so that neither log(1 - acc) nor
    # log(acc) becomes -inf.  Work on a copy: the caller reuses train_acc.
    acc = numpy.array(train_acc, dtype=float)
    acc[acc == 1] = 0.99999
    acc[acc == 0] = 0.00001

    # Broadcasting compares the prediction against every candidate row.
    agree = (pred_fp == temp_matrix)
    scores = numpy.sum(numpy.log(numpy.where(agree, acc, 1 - acc)), 1)

    # Ascending order: worst candidate first, best last.
    order = numpy.argsort(scores)
    sorted_candis = numpy.array(candis)[order]
    sorted_scores = scores[order]

    # Shift so the lowest score is 1, then rescale into (0, 1].
    sorted_scores = sorted_scores + abs(sorted_scores[0]) + 1
    max_score = float(max(sorted_scores))
    sorted_scores = (sorted_scores+max_score/n_candi/n_candi) / (max_score + max_score/n_candi)

    # Reverse so that the largest score goes first.
    return list(zip(sorted_candis[::-1], sorted_scores[::-1]))

def search(MASS_DB,FP_DB,test_spectra,pred_fps,ppm):
    """Rank database candidates for every query spectrum.

    For each spectrum, candidates are the mass-database entries whose mass
    lies strictly within ``ppm`` parts-per-million of the query mass; they
    are then ordered by ``sort_candis`` using the predicted fingerprint.

    Besides its arguments the function reads two files relative to the
    current working directory: ``util/used_fp_index`` and
    ``util/train_acc.txt`` (both space-separated numbers).

    Parameters
    ----------
    MASS_DB : str
        Path of the id/mass text file parsed by ``get_id_mass``.
    FP_DB : str
        Path of a pickled mapping from id to fingerprint.
    test_spectra : iterable
        Query spectra, each exposing a ``mass`` attribute.
    pred_fps : 2-D array
        Row ``i`` is the predicted fingerprint of query ``i``.
    ppm : number or numeric string
        Mass tolerance in parts per million.

    Returns
    -------
    list of list of tuple
        One list of ``(id, score)`` pairs per query, best first.  A query
        with no candidate inside the mass window gets ``[("None", -1)]``.
    """
    ID_ARRAY, MASS_ARRAY = get_id_mass(MASS_DB)
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point FP_DB at files from a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(FP_DB,"rb") as f:
        FP_DB = pickle.load(f)

    res_fp_file = "util/train_acc.txt"
    data_fp_ind = numpy.fromfile("util/used_fp_index",sep=" ")

    n_fp = len(data_fp_ind)
    # All used bits are kept for now; a fingerprint-selection step could
    # narrow this later.  A concrete list indexes the same on Python 2 and 3.
    fp_sel = list(range(n_fp))

    query_mass = get_query_mass(test_spectra)
    train_acc = numpy.fromfile(res_fp_file,sep=" ")

    # ppm -> relative tolerance, computed once for all queries.
    rel_tol = float(ppm)/1000000
    all_res = []
    for i, q_mass in enumerate(query_mass):
        # Mass filtering: keep ids whose mass is within the absolute window.
        tolerant = rel_tol*q_mass
        candis = ID_ARRAY[abs(q_mass - MASS_ARRAY) < tolerant]

        pred_fp = pred_fps[i,:]
        n_candi = len(candis)
        if n_candi == 0:
            # No hit for this query; a larger ppm would widen the window.
            all_res.append([("None",-1)])
            continue
        one_res = sort_candis(pred_fp,candis,n_fp,n_candi,FP_DB,train_acc,data_fp_ind,fp_sel)
        all_res.append(one_res)
    return all_res # ranked candidates for each query mass spectrum
#search()


