Go to the documentation of this file.00001 <!--
00002 _ _______ _
00003 | | |__ __\ | |
00004 _ __ | |_ __ | | ___ ___ | |___
00005 | '_ \| | '_ \| |/ _ \ / _ \| / __|
00006 | | | | | |_) | | (_) | (_) | \__ \
00007 |_| |_|_| .__/|_|\___/ \___/|_|___/
00008 ___________________________| |_________________________________________
00009 | |_| |\
00010 | |_\
00011 | File : MultinomialNaiveBayes.php |
00012 | Created : 16-Feb-2012 |
00013 | By : atrilla |
00014 | |
00015 | nlpTools - Natural Language Processing Toolkit for PHP |
00016 | |
00017 | Copyright (c) 2012 Alexandre Trilla |
00018 | |
00019 | ___________________________________________________________________ |
00020 | |
00021 | This file is part of nlpTools. |
00022 | |
00023 | nlpTools is free software: you can redistribute it and/or modify |
00024 | it under the terms of the MIT/X11 License as published by the |
00025 | Massachusetts Institute of Technology. See the MIT/X11 License |
00026 | for more details. |
00027 | |
00028 | You should have received a copy of the MIT/X11 License along with |
00029 | this source code distribution of nlpTools (see the COPYING file |
00030 | in the root directory). If not, see |
00031 | <http://www.opensource.org/licenses/mit-license.php>. |
00032 |_________________________________________________________________________|
00033 -->
00034
00035 <?php
00036
00037 require(dirname(__FILE__)."/Classifier.php");
00038
00039 include_once("DB.php");
00040 include_once(dirname(__FILE__)."/../util/dbauth/DBAuthManager.php");
00041 include_once(dirname(__FILE__)."/../tokenisation/WhitespaceTok.php");
00042
/**
 * Multinomial Naive Bayes text classifier with add-one smoothing.
 *
 * The model is estimated in memory by train(), written to a database by
 * save(), and read back from that database, query by query, by classify().
 * Texts are split into terms with a WhitespaceTok tokeniser.
 *
 * Storage layout (as created by save()):
 *  - one table named after tabPriorName holding (Category, Probability);
 *  - one table per category, named tabCondName."_".category, holding
 *    (Term, Probability), including one row for the Classifier::OOV term.
 *
 * NOTE(review): category labels are embedded verbatim in table names, so
 * they must be valid unquoted SQL identifiers. The statements issued
 * (CREATE DATABASE / USE) look MySQL-flavoured -- confirm before using
 * another DBMS.
 */
class MultinomialNaiveBayes implements Classifier {

    /**
     * Prior probability of every category, keyed by category label.
     * Stays null until train() has run.
     */
    private $thePrior;

    /**
     * Smoothed conditional term probabilities: category => (term => prob).
     * Stays null until train() has run.
     */
    private $theCondProb;

    /**
     * Name of the database the model is saved to and classified from.
     */
    private $theDBName;

    /**
     * Tokeniser used to split texts into terms.
     */
    private $theTokeniser;

    /**
     * Name of the table that stores the category priors.
     */
    const tabPriorName = "prior";

    /**
     * Prefix of the per-category tables that store term probabilities.
     */
    const tabCondName = "cond";

    /**
     * Builds an untrained classifier that tokenises on whitespace.
     */
    public function __construct() {
        $this->theTokeniser = new WhitespaceTok;
    }

    /**
     * Estimates the priors and the conditional term probabilities.
     *
     * Priors are the share of all observed term occurrences that belong to
     * each category. Conditional probabilities use add-one smoothing over
     * the vocabulary gathered from all categories, and every category also
     * receives a Classifier::OOV entry for unseen terms.
     *
     * @param array $dataTrain Training texts.
     * @param array $dataLabel Category label of each training text, in the
     *                         same order as $dataTrain.
     * @throws Exception If the two arrays differ in size, or if the
     *                   training texts contain no terms at all.
     */
    public function train(array &$dataTrain, array &$dataLabel) {
        if (count($dataTrain) != count($dataLabel)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "different numbers of labelled instances are given!\n");
        }
        $catVocabCount =
            $this->extractCatVocabCounts($dataTrain, $dataLabel);
        $total = 0;
        $catCount = array();
        $vocabulary = array();
        foreach ($catVocabCount as $cat => $termCounts) {
            $catCount[$cat] = 0;
            foreach ($termCounts as $term => $count) {
                $catCount[$cat] += $count;
                $vocabulary[$term] = 1;
            }
            $total += $catCount[$cat];
        }
        // Without a single observed term every denominator below would be
        // zero; fail loudly instead of storing undefined probabilities.
        if (count($catCount) > 0 && $total == 0) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "the training data contains no terms!\n");
        }
        $vocabSize = count($vocabulary);
        // Fresh arrays are built (rather than mutating through foreach
        // references) so that no dangling reference outlives the loops.
        $prior = array();
        $condProb = array();
        foreach ($catCount as $cat => $cCount) {
            $denominator = $cCount + $vocabSize;
            $condProb[$cat] = array();
            foreach ($catVocabCount[$cat] as $term => $tCount) {
                $condProb[$cat][$term] = ($tCount + 1) / $denominator;
            }
            // Probability mass reserved for terms never seen in training.
            $condProb[$cat][Classifier::OOV] = 1 / $denominator;
            $prior[$cat] = $cCount / $total;
        }
        $this->thePrior = $prior;
        $this->theCondProb = $condProb;
    }

    /**
     * Counts how often each term occurs in the texts of each category.
     *
     * Texts and labels are paired by position, so the arrays do not need
     * to use contiguous 0..n-1 keys, and empty input yields an empty result.
     *
     * @param array $text Training texts.
     * @param array $lab  Category label of each text, in the same order.
     * @return array Raw counts: category => (term => occurrences).
     */
    private function extractCatVocabCounts(array &$text, array &$lab) {
        $catVoc = array();
        $texts = array_values($text);
        $labels = array_values($lab);
        $numInstances = count($texts);
        for ($ind = 0; $ind < $numInstances; $ind++) {
            $cat = $labels[$ind];
            if (!isset($catVoc[$cat])) {
                $catVoc[$cat] = array();
            }
            $words = $this->theTokeniser->tokenise($texts[$ind]);
            foreach ($words as $word) {
                if (isset($catVoc[$cat][$word])) {
                    $catVoc[$cat][$word] += 1;
                } else {
                    $catVoc[$cat][$word] = 1;
                }
            }
        }
        return $catVoc;
    }

    /**
     * Sets the name of the database that holds the model.
     *
     * When getPrefixID() returns a non-empty value, the name is prefixed
     * with it, joined by an underscore.
     *
     * @param string $name Database name (without prefix).
     */
    public function setDatabase($name) {
        $name = (string)$name;
        $prefix = getPrefixID();
        if ($prefix != "") {
            $this->theDBName = $prefix."_".$name;
        } else {
            $this->theDBName = $name;
        }
    }

    /**
     * Writes the trained model to the database set with setDatabase().
     *
     * Creates the database, the prior table and one conditional table per
     * category, then inserts every probability. Category and term values
     * are escaped with quoteSmart(), so terms containing quotes are stored
     * correctly.
     *
     * NOTE(review): the results of the CREATE/INSERT statements are not
     * inspected, as in the original code; saving over an existing database
     * therefore does not raise an error here.
     *
     * @throws Exception If the model has not been trained, or if the
     *                   connection to the DBMS fails.
     */
    public function save() {
        // Check the precondition before anything is created in the DBMS.
        if (is_null($this->thePrior) || is_null($this->theCondProb)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "the model must be trained before it is saved!\n");
        }
        $db = DB::connect(getConnection());
        if (DB::isError($db)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "DBMS connection error! ".$db->getMessage()."\n");
        }
        $db->query("CREATE DATABASE ".$this->theDBName.";");
        $db->query("USE ".$this->theDBName.";");
        $db->query("CREATE TABLE ".
            MultinomialNaiveBayes::tabPriorName.
            " ( Category varchar(255), Probability Double );");
        foreach ($this->thePrior as $cat => $prob) {
            // Array keys may have been cast to int; store them as text.
            $db->query("INSERT INTO ".
                MultinomialNaiveBayes::tabPriorName.
                " VALUES ( ".$db->quoteSmart((string)$cat).", ".
                $db->quoteSmart((float)$prob)." );");
        }
        foreach ($this->theCondProb as $cat => $termProb) {
            $condTable = MultinomialNaiveBayes::tabCondName."_$cat";
            $db->query("CREATE TABLE ".$condTable.
                " ( Term varchar(255), Probability Double )");
            foreach ($termProb as $term => $prob) {
                $db->query("INSERT INTO ".$condTable.
                    " VALUES ( ".$db->quoteSmart((string)$term).", ".
                    $db->quoteSmart((float)$prob)." );");
            }
        }
        $db->disconnect();
    }

    /**
     * Assigns the most probable category to a text.
     *
     * The score of a category is the log of its prior plus the sum of the
     * log probabilities of the text's terms; terms missing from a category
     * table fall back to that category's Classifier::OOV probability.
     *
     * @param string $dataTest Text to classify.
     * @return string The best-scoring category label, or "indeterminate"
     *                when the prior table lists no category.
     * @throws Exception If the connection or any query fails.
     */
    public function classify($dataTest) {
        $db = DB::connect(getConnection().$this->theDBName);
        if (DB::isError($db)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "DB connection error! ".$db->getMessage()."\n");
        }
        $dataTest = (string)$dataTest;
        $words = $this->theTokeniser->tokenise($dataTest);
        // Release the connection on both the normal and the failing path.
        try {
            $decision = $this->pickBestCategory($db, $words);
        } catch (Exception $e) {
            $db->disconnect();
            throw $e;
        }
        $db->disconnect();
        return $decision;
    }

    /**
     * Scores every stored category for the given terms and picks the best.
     *
     * @param object $db    Open PEAR DB connection to the model database.
     * @param array  $words Terms of the text being classified.
     * @return string Best category label, or "indeterminate" if none exist.
     * @throws Exception If a query fails.
     */
    private function pickBestCategory($db, $words) {
        $catLabel = $db->getCol("SELECT Category FROM ".
            MultinomialNaiveBayes::tabPriorName.";");
        if (DB::isError($catLabel)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "DB query error! ".$catLabel->getMessage()."\n");
        }
        $decision = "indeterminate";
        // null marks "no category scored yet"; using the decision string as
        // the sentinel would misbehave for a category with that same name.
        $probDecision = null;
        foreach ($catLabel as $cat) {
            $condTable = MultinomialNaiveBayes::tabCondName."_$cat";
            $prior = $this->fetchProbability($db,
                MultinomialNaiveBayes::tabPriorName, "Category", $cat);
            $score = log((float)$prior);
            // Looked up lazily and at most once per category.
            $oovProb = null;
            foreach ($words as $word) {
                $termProb = $this->fetchProbability($db, $condTable,
                    "Term", $word);
                if (is_null($termProb)) {
                    if (is_null($oovProb)) {
                        $oovProb = $this->fetchProbability($db,
                            $condTable, "Term", Classifier::OOV);
                    }
                    $termProb = $oovProb;
                }
                $score += log((float)$termProb);
            }
            if (is_null($probDecision) || $score > $probDecision) {
                $decision = $cat;
                $probDecision = $score;
            }
        }
        return $decision;
    }

    /**
     * Reads one Probability value from a model table.
     *
     * @param object $db     Open PEAR DB connection.
     * @param string $table  Table to read from.
     * @param string $column Name of the key column to match.
     * @param string $value  Key value; escaped before use.
     * @return mixed The stored probability, or null when no row matches.
     * @throws Exception If the query fails.
     */
    private function fetchProbability($db, $table, $column, $value) {
        $result = $db->getOne("SELECT Probability FROM ".$table.
            " WHERE ".$column." = ".$db->quoteSmart((string)$value).";");
        if (DB::isError($result)) {
            throw new Exception("MultinomialNaiveBayes classifier: ".
                "DB query error! ".$result->getMessage()."\n");
        }
        return $result;
    }
}
00275
00276 ?>