Dataset.php

Go to the documentation of this file.
00001 <!--
00002                            _    _______          _     
00003                           | |  |__   __\        | |    
00004                      _ __ | |_ __ | | ___   ___ | |___ 
00005                     | '_ \| | '_ \| |/ _ \ / _ \| / __|
00006                     | | | | | |_) | | (_) | (_) | \__ \
00007                     |_| |_|_| .__/|_|\___/ \___/|_|___/
00008  ___________________________| |_________________________________________
00009 |                           |_|                                        |\
00010 |                                                                      |_\
00011 |   File    : Dataset.php                                                 |
00012 |   Created : 16-Feb-2012                                                 |
00013 |   By      : atrilla                                                     |
00014 |                                                                         |
00015 |   nlpTools - Natural Language Processing Toolkit for PHP                |
00016 |                                                                         |
00017 |   Copyright (c) 2012 Alexandre Trilla                                   |
00018 |                                                                         |
00019 |   ___________________________________________________________________   |
00020 |                                                                         |
00021 |   This file is part of nlpTools.                                        |
00022 |                                                                         |
00023 |   nlpTools is free software: you can redistribute it and/or modify      |
00024 |   it under the terms of the MIT/X11 License as published by the         |
00025 |   Massachusetts Institute of Technology. See the MIT/X11 License        |
00026 |   for more details.                                                     |
00027 |                                                                         |
00028 |   You should have received a copy of the MIT/X11 License along with     |
00029 |   this source code distribution of nlpTools (see the COPYING file       |
00030 |   in the root directory). If not, see                                   |
00031 |   <http://www.opensource.org/licenses/mit-license>.                     |
00032 |_________________________________________________________________________|
00033 -->
00034 
00035 <?php
00036 
00037 require(dirname(__FILE__)."/Feeder.php");
00038 
00039 include_once(dirname(__FILE__)."/../../tokenisation/WhitespaceTok.php");
00040 
/**
 * Feeder that loads a labelled dataset from a plain-text file.
 *
 * Every line of the file is treated as one instance. The line is split
 * with WhitespaceTok; the last token is taken as the instance label and
 * the remaining tokens, re-joined with single spaces, as its text.
 */
class Dataset implements Feeder {

    /**
     * Reads the dataset file and separates texts from labels.
     *
     * @param string $sourceURL Path to the dataset file (cast to string
     *                          and resolved with realpath()).
     * @return array Two-element array: index 0 holds the list of instance
     *               texts, index 1 the list of labels; both lists are
     *               aligned by position.
     * @throws Exception If the path does not resolve to a regular file,
     *                   or if the file cannot be read.
     */
    public function getFood($sourceURL) {
        $text = array();
        $labs = array();
        // realpath() returns false for an unresolvable path; guard it
        // explicitly instead of relying on is_file(false).
        $sourceURL = realpath((string)$sourceURL);
        if ($sourceURL === false || !is_file($sourceURL)) {
            throw new Exception("Dataset feeder: source location ".
                "unknown!\n");
        }
        $contents = file_get_contents($sourceURL);
        if ($contents === false) {
            // Previously a failed read silently produced an empty dataset.
            throw new Exception("Dataset feeder: source location ".
                "unreadable!\n");
        }
        $instance = explode("\n", $contents);
        // A file ending in a newline leaves a blank piece at the end.
        // Drop it only when it really is blank, so that a file without a
        // trailing newline does not lose its last instance.
        if (trim(end($instance)) === "") {
            array_pop($instance);
        }
        $tokeniser = new WhitespaceTok;
        foreach ($instance as $inst) {
            // NOTE(review): assumes tokenise() returns an array of tokens
            // -- confirm against WhitespaceTok.
            $words = $tokeniser->tokenise($inst);
            // The label is the last token; what remains is the text.
            $labs[] = array_pop($words);
            $text[] = implode(" ", $words);
        }
        return array($text, $labs);
    }
}
00079 
00080 ?>