# File raramorph.rb, line 81
#
# Splits a piece of text into tokens made only of the characters accepted by
# the character class below; every run of any other character is a separator.
#
# str     - String holding the text to tokenize. It is not modified: a copy is
#           re-tagged as UTF-8, so frozen strings are accepted as well.
#
# Returns an Array of Strings. When the split produces nothing (the cleaned
# string is empty or consists solely of separator characters), the Array
# holds the cleaned string itself so that at least one token is returned.
def self.tokenize(str)
  # Work on a private copy: force_encoding mutates its receiver, which would
  # otherwise change the caller's object and raise FrozenError on frozen input.
  text = str.dup.force_encoding("UTF-8").strip

  # @@space_regex is defined elsewhere in the class; each match is replaced by
  # a single space (presumably it matches whitespace runs -- TODO confirm).
  text = text.gsub(@@space_regex, " ")

  # The following code points are deliberately left out of the character class
  # below, so they behave as separators:
  #   \u0688 : ARABIC LETTER DDAL
  #   \u06A9 : ARABIC LETTER KEHEH
  #   \u0691 : ARABIC LETTER RREH
  #   \u06BA : ARABIC LETTER NOON GHUNNA
  #   \u06BE : ARABIC LETTER HEH DOACHASHMEE
  #   \u06C1 : ARABIC LETTER HEH GOAL
  #   \u06D2 : ARABIC LETTER YEH BARREE
  #
  # NOTE(review): String#split keeps a leading empty field, so text that starts
  # with a separator yields "" as its first token. Kept as is because callers
  # may rely on it.
  parts = text.split(/[^\u067E\u0686\u0698\u06AF\u0621-\u0636\u0637-\u0643\u0644\u0645-\u0648\u0649-\u064A\u064B-\u064E\u064F\u0650\u0651\u0652]+/)

  # Return at least one token: fall back to the cleaned string itself.
  parts.empty? ? [text] : parts
end