#!/usr/bin/env python

"""Implementation of general algorithms for information retrieval.

Relevant definitions:
term frequency       the number of times a term appears in a document.
document frequency   the number of documents (from a collection) that
                     a term appears in.
collection frequency the total number of times that a term appears
                     in a collection.
N   the number of documents in the collection
tf  short for term frequency
df  short for document frequency
cf  short for collection frequency


Functions:
log_tf         Calculate the logarithm term frequency.
augmented_tf   Calculate the augmented term frequency.
hersh_tf       Calculate Bill Hersh's term frequency.
log_idf        Calculate the logarithm inverse doc frequency.
hersh_idf      Calculate Bill Hersh's inverse doc frequency.
residual_idf   Calculate the residual inverse doc frequency.

p_kmix         Calculate probability from a k-mixture distribution.
est_kmix       Estimate the parameters for a k-mixture distribution.
veccos_score   Calculate the vector cosine score of two vectors.


References:
Foundations of Statistical Natural Language Processing.
Christopher D. Manning, Hinrich Schutze
The MIT Press, Cambridge, MA 1999

An Analysis of Statistical Term Strength and Its Use in the Indexing
and Retrieval of Molecular Biology Texts
W. John Wilbur and Yiming Yang
Comput. Biol. Med. Vol 26, No. 3, pp. 209-222, 1996

Information Retrieval, A Health Care Perspective
William R. Hersh
1996 Springer-Verlag, New York

"""

# XXX change order of idf parameters

import math
import Numeric

def log_tf(tf):
    """log_tf(tf) -> score

    Calculate the logarithm term frequency.

    log_tf = 1 + log_2(tf)

    A term frequency of 0 scores 0.0 rather than raising a math
    domain error.

    """
    if not tf:
        return 0.0
    # math.log with an explicit base replaces the hand-rolled _log2
    # helper; CPython computes it the same way (log(x)/log(base)).
    return 1.0 + math.log(tf, 2.0)

def augmented_tf(tf, max_tf):
    """augmented_tf(tf, max_tf) -> score

    Calculate the augmented term frequency.  max_tf is the maximum
    term frequency of any term in the document.

    augmented_tf = 0.5 + 0.5(tf / max_tf)

    """
    # A term that never occurs scores 0.0; this also guarantees that
    # max_tf is only used when it is positive (tf > 0 implies
    # max_tf > 0).
    if tf:
        return 0.5 + (0.5 * tf) / max_tf
    return 0.0

def hersh_tf(tf):
    """hersh_tf(tf) -> score

    Calculate Bill Hersh's term frequency.

    hersh_tf = 1 + log_10(tf)

    """
    # Zero term frequency scores 0.0 instead of raising a math
    # domain error.
    if tf:
        return math.log10(tf) + 1.0
    return 0.0

def log_idf(N, df):
    """log_idf(N, df) -> score

    Calculate the logarithm inverse doc frequency.  N is the number
    of documents in the collection; df is the number of documents
    the term appears in.

    log_idf = log_2(N/df)

    Returns 0.0 when df is 0 (the term appears in no document).

    """
    if not df:
        return 0.0
    # math.log with an explicit base replaces the hand-rolled _log2
    # helper; CPython computes it the same way (log(x)/log(base)).
    return math.log(float(N)/df, 2.0)

def hersh_idf(N, df):
    """hersh_idf(N, df) -> score

    Calculate Bill Hersh's inverse doc frequency.  N is the number
    of documents in the collection; df is the document frequency.

    hersh_idf = log_10(N/df) + 1

    """
    # An unseen term (df == 0) gets a score of 0.0.
    if df:
        return 1.0 + math.log10(N / float(df))
    return 0.0

def residual_idf(N, df, cf):
    """residual_idf(N, df, cf) -> score

    Calculate the residual inverse doc frequency: the observed IDF
    minus the IDF expected under a Poisson model of term occurrence.
    N is the number of documents, df the document frequency, cf the
    collection frequency.

    Returns 0.0 when df is 0.

    """
    if not df:  # if df > 0, then N > 0 and cf > 0
        return 0.0
    # IDF = log_2(N/df)
    # lambda = cf/N            (Poisson rate of the term)
    # P(k; lambda) = e^(-lambda) * lambda^k / k!
    # residual_idf = IDF + log_2(1-P(0; lambda))
    #              = log_2(N/df) + log_2(1-e^(-lambda))
    #              = log_2(N / df) + log_2(1-e^(-cf/N))
    #              = log_2(N / df * (1-e^(-cf/N)))
    # In the errata of Statistical NLP at:
    # http://www.sultry.arts.usyd.edu.au/fsnlp/errata.html

    idf = float(N)/df
    p_zero = math.exp(-float(cf)/N)  # Poisson P(0; lambda)
    # math.log with an explicit base replaces the hand-rolled _log2
    # helper; CPython computes it the same way (log(x)/log(base)).
    return math.log(idf*(1.0-p_zero), 2.0)

def veccos_score(vec1, vec2):
    """veccos_score(vec1, vec2) -> score

    Given two vectors, calculate their similarity score based on a
    vector cosine algorithm:

        dot(vec1, vec2) / (|vec1| * |vec2|)

    vec1 and vec2 are equal-length sequences of numbers.  Returns
    0.0 if either vector has zero norm, or if the computation
    overflows.

    """
    # Pure-Python implementation; the defunct Numeric package is not
    # needed for a simple dot product and a pair of norms.
    norm1 = math.sqrt(sum(x * x for x in vec1))
    norm2 = math.sqrt(sum(x * x for x in vec2))
    dot = sum(x * y for x, y in zip(vec1, vec2))
    try:
        return dot / (norm1 * norm2)
    except ZeroDivisionError:
        return 0.0
    except OverflowError:
        return 0.0

def p_kmix(k, alpha, beta):
    """p_kmix(k, alpha, beta) -> probability of k

    Calculate the probability of observing k occurrences under a
    k-mixture distribution with parameters alpha and beta.

    """
    scaled = alpha/(beta+1.0)
    if k == 0:
        # The zero count also receives the leftover (1-alpha) mass.
        return (1.0-alpha) + scaled
    return scaled * (beta/(beta+1.0))**k

def est_kmix(N, df, cf):
    """est_kmix(N, df, cf) -> (alpha, beta)

    Estimate the parameters of a k-mixture distribution from the
    collection statistics: N documents, document frequency df, and
    collection frequency cf.

    NOTE(review): raises ZeroDivisionError when df == 0 or when
    cf == df (beta becomes 0) — presumably callers guarantee
    cf > df > 0; verify against the call sites.

    """
    lam = cf / float(N)          # average occurrences per document
    beta = (cf - df) / float(df) # extra occurrences per observed doc
    alpha = lam / beta
    return alpha, beta

# Natural log of 2, kept public for any code that references it.
LOG2 = math.log(2.0)

def _log2(x):
    """Return the base-2 logarithm of x."""
    # math.log takes an optional base; log_2(x) = ln(x)/ln(2).
    return math.log(x, 2.0)

