00001 <!--
00002 _ _______ _
00003 | | |__ __\ | |
00004 _ __ | |_ __ | | ___ ___ | |___
00005 | '_ \| | '_ \| |/ _ \ / _ \| / __|
00006 | | | | | |_) | | (_) | (_) | \__ \
00007 |_| |_|_| .__/|_|\___/ \___/|_|___/
00008 ___________________________| |_________________________________________
00009 | |_| |\
00010 | |_\
00011 | File : WhitespaceTok.php |
00012 | Created : 16-Feb-2012 |
00013 | By : atrilla |
00014 | |
00015 | nlpTools - Natural Language Processing Toolkit for PHP |
00016 | |
00017 | Copyright (c) 2012 Alexandre Trilla |
00018 | |
00019 | ___________________________________________________________________ |
00020 | |
00021 | This file is part of nlpTools. |
00022 | |
00023 | nlpTools is free software: you can redistribute it and/or modify |
00024 | it under the terms of the MIT/X11 License as published by the |
00025 | Massachusetts Institute of Technology. See the MIT/X11 License |
00026 | for more details. |
00027 | |
00028 | You should have received a copy of the MIT/X11 License along with |
00029 | this source code distribution of nlpTools (see the COPYING file |
00030 | in the root directory). If not, see |
00031 | <http://opensource.org/licenses/MIT>. |
00032 |_________________________________________________________________________|
00033 -->
00034
00035 <?php
00036
00037 require(dirname(__FILE__)."/Tokeniser.php");
00038
/**
 * Tokeniser that splits a text on whitespace and isolates every character
 * that is not an ASCII letter or digit as a token of its own.
 */
class WhitespaceTok implements Tokeniser {

    /**
     * Splits a text into word and symbol tokens.
     *
     * Runs of ASCII letters and digits form one token each; every other
     * non-whitespace character becomes a single-character token. Whitespace
     * (spaces, tabs, line breaks) only separates tokens and is never
     * returned as a token itself.
     *
     * NOTE: the patterns run without the "u" modifier, so matching is done
     * byte by byte; a multi-byte UTF-8 character is therefore emitted as one
     * token per byte.
     *
     * @param mixed $text Text to tokenise; it is cast to string. It is
     *     received by reference and overwritten with its normalised form,
     *     i.e. the tokens joined by single spaces.
     * @return array List of non-empty string tokens, in order of appearance;
     *     an empty array when the text holds no tokens.
     */
    public function tokenise(&$text) {
        // Pad every character that is neither an ASCII letter/digit nor
        // whitespace so that it ends up as a stand-alone token.
        $text = preg_replace('/[^a-zA-Z0-9\s]/', ' $0 ', (string)$text);
        // Fold each whitespace run into one space so that tokens are
        // separated by exactly one delimiter.
        $text = preg_replace('/\s+/', ' ', $text);
        // Trim only after padding: a leading or trailing symbol has just
        // gained an outer space that would otherwise yield an empty token.
        $text = trim($text);
        // explode() on an empty string returns array(""), not array().
        if ($text === '') {
            return array();
        }
        return explode(' ', $text);
    }
}
00061
00062 ?>