MultinomialNaiveBayes.php

Go to the documentation of this file.
00001 <!--
00002                            _    _______          _     
00003                           | |  |__   __\        | |    
00004                      _ __ | |_ __ | | ___   ___ | |___ 
00005                     | '_ \| | '_ \| |/ _ \ / _ \| / __|
00006                     | | | | | |_) | | (_) | (_) | \__ \
00007                     |_| |_|_| .__/|_|\___/ \___/|_|___/
00008  ___________________________| |_________________________________________
00009 |                           |_|                                        |\
00010 |                                                                      |_\
00011 |   File    : MultinomialNaiveBayes.php                                   |
00012 |   Created : 16-Feb-2012                                                 |
00013 |   By      : atrilla                                                     |
00014 |                                                                         |
00015 |   nlpTools - Natural Language Processing Toolkit for PHP                |
00016 |                                                                         |
00017 |   Copyright (c) 2012 Alexandre Trilla                                   |
00018 |                                                                         |
00019 |   ___________________________________________________________________   |
00020 |                                                                         |
00021 |   This file is part of nlpTools.                                        |
00022 |                                                                         |
00023 |   nlpTools is free software: you can redistribute it and/or modify      |
00024 |   it under the terms of the MIT/X11 License as published by the         |
00025 |   Massachusetts Institute of Technology. See the MIT/X11 License        |
00026 |   for more details.                                                     |
00027 |                                                                         |
00028 |   You should have received a copy of the MIT/X11 License along with     |
00029 |   this source code distribution of nlpTools (see the COPYING file       |
00030 |   in the root directory). If not, see                                   |
00031 |   <http://www.opensource.org/licenses/mit-license>.                     |
00032 |_________________________________________________________________________|
00033 -->
00034 
00035 <?php
00036 
00037 require(dirname(__FILE__)."/Classifier.php");
00038 
00039 include_once("DB.php");
00040 include_once(dirname(__FILE__)."/../util/dbauth/DBAuthManager.php");
00041 include_once(dirname(__FILE__)."/../tokenisation/WhitespaceTok.php");
00042 
/**
 * Multinomial Naive Bayes text classifier with add-one smoothing.
 *
 * Typical life cycle: train() estimates the model in memory, save() writes
 * it to a database through PEAR DB, and classify() scores a text against
 * the tables stored in that database. The database name must be set with
 * setDatabase() before save() or classify() is called.
 *
 * NOTE(review): category labels are embedded in table names
 * ("cond_<category>"), so they must be valid unquoted SQL identifiers.
 */
class MultinomialNaiveBayes implements Classifier {

    /**
     * Category priors: category label => probability.
     * Null until train() has been called.
     */
    private $thePrior;

    /**
     * Conditional term probabilities: category label => (term => probability).
     * Each category also holds a Classifier::OOV entry used for unseen terms.
     * Null until train() has been called.
     */
    private $theCondProb;

    /**
     * Name of the database used by save() and classify().
     */
    private $theDBName;

    /**
     * Tokeniser that splits a text into terms.
     */
    private $theTokeniser;

    /**
     * Name of the table that stores the category priors.
     */
    const tabPriorName = "prior";

    /**
     * Prefix of the per-category tables that store term probabilities.
     */
    const tabCondName = "cond";

    /**
     * Creates an untrained classifier that tokenises with WhitespaceTok.
     */
    public function __construct() {
        $this->theTokeniser = new WhitespaceTok;
    }

    /**
     * Estimates priors and smoothed conditional term probabilities.
     *
     * The prior of a category is the share of all counted term occurrences
     * that belong to it. The conditional probability of a term is
     * (count + 1) / (category term total + vocabulary size); terms unseen in
     * a category share the Classifier::OOV entry, 1 / (same denominator).
     *
     * @param array $dataTrain Training texts.
     * @param array $dataLabel Category label of each training text, in the
     *                         same order as $dataTrain.
     * @throws Exception If the two arrays differ in length, or if the
     *                   training data contains no terms at all.
     */
    public function train(array &$dataTrain, array &$dataLabel) {
        if (count($dataTrain) != count($dataLabel)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "different numbers of labelled instances are given!\n");
        }
        $catVocabCount =
            $this->extractCatVocabCounts($dataTrain, $dataLabel);
        $total = 0;
        $catCount = array();
        $vocabulary = array();
        foreach ($catVocabCount as $cat => $termCounts) {
            // Total sum of term frequencies observed in this category.
            $catCount[$cat] = array_sum($termCounts);
            // Key union: collects every distinct term of the corpus.
            $vocabulary += $termCounts;
            $total += $catCount[$cat];
        }
        if ($total == 0) {
            // Without this guard the priors would be computed as x / 0.
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "no training terms are given!\n");
        }
        $vocabSize = count($vocabulary);
        $prior = array();
        $condProb = array();
        foreach ($catVocabCount as $cat => $termCounts) {
            $denominator = $catCount[$cat] + $vocabSize;
            $condProb[$cat] = array();
            foreach ($termCounts as $term => $count) {
                $condProb[$cat][$term] = ($count + 1) / $denominator;
            }
            // For all the rest of terms in the vocabulary, which are
            // out-of-vocabulary with respect to the given category.
            $condProb[$cat][Classifier::OOV] = 1 / $denominator;
            $prior[$cat] = $catCount[$cat] / $total;
        }
        $this->thePrior = $prior;
        $this->theCondProb = $condProb;
    }

    /**
     * Counts term occurrences per category.
     *
     * Texts and labels are paired by position, so the arrays do not need
     * to be keyed 0..n-1, and empty input yields an empty result.
     *
     * @param array $text Training texts.
     * @param array $lab  Category label of each text, same order as $text.
     * @return array Category label => (term => number of occurrences).
     */
    private function extractCatVocabCounts(array &$text, array &$lab) {
        $catVoc = array();
        $labels = array_values($lab);
        foreach (array_values($text) as $ind => $document) {
            $category = $labels[$ind];
            if (!array_key_exists($category, $catVoc)) {
                $catVoc[$category] = array();
            }
            foreach ($this->theTokeniser->tokenise($document) as $word) {
                if (isset($catVoc[$category][$word])) {
                    $catVoc[$category][$word] += 1;
                } else {
                    $catVoc[$category][$word] = 1;
                }
            }
        }
        return $catVoc;
    }

    /**
     * Sets the database used by save() and classify().
     *
     * When getPrefixID() returns a non-empty value, the stored name is
     * "<prefix>_<name>"; otherwise it is the name itself.
     * NOTE(review): getPrefixID() is not defined in this file; presumably
     * it comes from DBAuthManager.php - confirm.
     *
     * @param string $name Database name, without the prefix.
     */
    public function setDatabase($name) {
        $name = (string)$name;
        $prefix = getPrefixID();
        if ($prefix != "") {
            $this->theDBName = $prefix."_".$name;
        } else {
            $this->theDBName = $name;
        }
    }

    /**
     * Writes the trained model to the database set with setDatabase().
     *
     * Creates one "prior" table (Category, Probability) and one
     * "cond_<category>" table (Term, Probability) per category.
     *
     * @throws Exception If the classifier has not been trained, if the
     *                   DBMS connection fails, or if a statement other
     *                   than CREATE DATABASE fails.
     */
    public function save() {
        // Checked up front so an untrained classifier never touches the DBMS.
        if (is_null($this->thePrior) || is_null($this->theCondProb)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "the model must be trained before it is saved!\n");
        }
        $db = DB::connect(getConnection());
        if (DB::isError($db)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "DBMS connection error! ".$db->getMessage()."\n");
        }
        // Best effort: the database may already exist. A real problem
        // surfaces on the USE statement below.
        $db->query("CREATE DATABASE ".$this->theDBName.";");
        try {
            $this->checked($db->query("USE ".$this->theDBName.";"));
            $this->checked($db->query("CREATE TABLE ".
                self::tabPriorName.
                " ( Category varchar(255), Probability Double );"));
            foreach ($this->thePrior as $cat => $prob) {
                // Placeholders let the driver quote the values, so labels
                // containing quote characters cannot break the statement.
                $this->checked($db->query("INSERT INTO ".
                    self::tabPriorName." VALUES ( ?, ? );",
                    array((string)$cat, $prob)));
            }
            // By using one table for each category, the amount
            // of cat-wise terms is reduced drastically due to
            // sparseness issues.
            foreach ($this->theCondProb as $cat => $termProb) {
                $condTable = self::tabCondName."_$cat";
                $this->checked($db->query("CREATE TABLE ".$condTable.
                    " ( Term varchar(255), Probability Double )"));
                foreach ($termProb as $term => $prob) {
                    // Numeric-looking terms are integer array keys, hence
                    // the cast back to string.
                    $this->checked($db->query("INSERT INTO ".$condTable.
                        " VALUES ( ?, ? );",
                        array((string)$term, $prob)));
                }
            }
        } catch (Exception $e) {
            // Release the connection before reporting the failure.
            $db->disconnect();
            throw $e;
        }
        $db->disconnect();
    }

    /**
     * Assigns the most likely category to a text.
     *
     * For each stored category the score is log(prior) plus the sum of
     * log(P(term | category)) over the tokens of the text; tokens without
     * a row in the category table use that table's Classifier::OOV row.
     *
     * @param string $dataTest Text to classify.
     * @return string The highest-scoring category (the first one on ties),
     *                or "indeterminate" when no category is stored.
     * @throws Exception If the database connection or a query fails.
     */
    public function classify($dataTest) {
        $db = DB::connect(getConnection().$this->theDBName);
        if (DB::isError($db)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "DB connection error! ".$db->getMessage()."\n");
        }
        $dataTest = (string)$dataTest;
        $words = $this->theTokeniser->tokenise($dataTest);
        $decision = "indeterminate";
        // Null means "no category scored yet"; keeping this apart from the
        // label avoids clashing with a category named "indeterminate".
        $bestScore = null;
        try {
            $catLabel = $this->checked($db->getCol(
                "SELECT Category FROM ".self::tabPriorName.";"));
            foreach ($catLabel as $cat) {
                $condTable = self::tabCondName."_$cat";
                $prior = $this->checked($db->getOne(
                    "SELECT Probability FROM ".self::tabPriorName.
                    " WHERE Category = ?;", array($cat)));
                $score = log($prior);
                // Looked up at most once per category, on first need.
                $oovProb = null;
                foreach ($words as $word) {
                    $prob = $this->checked($db->getOne(
                        "SELECT Probability FROM ".$condTable.
                        " WHERE Term = ?;", array($word)));
                    if (is_null($prob)) {
                        if (is_null($oovProb)) {
                            $oovProb = $this->checked($db->getOne(
                                "SELECT Probability FROM ".$condTable.
                                " WHERE Term = ?;",
                                array(Classifier::OOV)));
                        }
                        $prob = $oovProb;
                    }
                    $score += log($prob);
                }
                if (is_null($bestScore) || $score > $bestScore) {
                    $decision = $cat;
                    $bestScore = $score;
                }
            }
        } catch (Exception $e) {
            // Release the connection before reporting the failure.
            $db->disconnect();
            throw $e;
        }
        $db->disconnect();
        return $decision;
    }

    /**
     * Passes a PEAR DB result through, turning a DB error into an exception.
     *
     * @param mixed $result Value returned by a PEAR DB call.
     * @return mixed The same value when it is not an error.
     * @throws Exception If DB::isError() reports $result as an error.
     */
    private function checked($result) {
        if (DB::isError($result)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "DBMS query error! ".$result->getMessage()."\n");
        }
        return $result;
    }
}
00275 
00276 ?>