Fix to issue joshfraser#34 to account for index 0 of the array

aait · Oct 19, 2017 · 9c42602 · 9c42602
1 parent 87d0b55
commit 9c42602
Showing 1 changed file with 107 additions and 51 deletions.
diff --git a/parser.php b/parser.php
@@ -8,6 +8,8 @@
  *   - given/first name
  *   - middle name/initial(s)
  *   - surname (last name)
+ *   - surname base (last name without compounds)
+ *   - surname compounds (only the compounds)
  *   - suffix (II, PhD, Jr. etc)
  *
  * Author: Josh Fraser
@@ -99,7 +101,7 @@ class FullNameParser {
       'Doc.' => array('associate professor'),
       ' ' => array('the')
     ),
-    'compound' => array('da','de','del','della','der','di','du','la','pietro','st.','st','ter','van','vanden','vere','von'),
+    'compound' => array('da','de','del','della', 'dem', 'den', 'der','di','du', 'het', 'la', 'onder', 'op', 'pietro','st.','st','\'t', 'ten', 'ter','van','vanden','vere','von'),
     'suffixes' => array(
       'line' => array('I','II','III','IV','V','1st','2nd','3rd','4th','5th','Senior','Junior','Jr','Sr'),
       'prof' => array('AO', 'B.A.', 'M.Sc', 'BCompt', 'PhD', 'Ph.D.','APR','RPh','PE','MD', 'M.D.', 'MA','DMD','CME', 'BSc', 'Bsc', 'BSc(hons)', 'Ph.D.', 'BEng', 'M.B.A.', 'MBA', 'FAICD', 'CM', 'OBC', 'M.B.', 'ChB', 'FRCP', 'FRSC',
@@ -108,7 +110,7 @@ class FullNameParser {
           'QC', 'Q.C.', 'M.Tech', 'CTA', 'C.I.M.A.', 'B.Ec',
           'CFIA','ICCP','CPS','CAP-OM','CAPTA','TNAOAP','AFA','AVA','ASA','CAIA','CBA','CVA','ICVS','CIIA','CMU','PFM','PRM','CFP','CWM','CCP','EA','CCMT','CGAP','CDFM','CFO','CGFM','CGAT','CGFO','CMFO','CPFO','CPFA',
           'BMD','BIET','P.Eng','PE', 'MBBS', 'MB', 'BCh', 'BAO', 'BMBS', 'MBBChir', 'MBChBa','MPhil','LL.D', 'LLD', 'D.Lit','DEA', 'DESS', 'DClinPsy', 'DSc', 'MRes', 'M.Res', 'Psy.D', 'Pharm.D',
-          'BA(Admin)', 'BAcc','BACom','BAdmin','BAE','BAEcon','BA(Ed)','BA(FS)','BAgr','BAH','BAI','BAI(Elect)','BAI(Mech)','BALaw','BAO','BAppSc','BArch','BArchSc','BARelSt','BASc','BASoc', 'D.D.S.',
+          'BA(Admin)', 'BAcc','BACom','BAdmin','BAE','BAEcon','BA(Ed)','BA(FS)','BAgr','BAH','BAI','BAI(Elect)','BAI(Mech)','BALaw','BAO','BAppSc','BArch','BArchSc','BARelSt','BASc','BASoc', 'DDS', 'D.D.S.',
           'BASS','BATheol','BBA','BBLS', 'BBS','BBus','BChem','BCJ','BCL','BCLD(SocSc)','BClinSci','BCom','BCombSt','BCommEdCommDev','BComp','BComSc','BCoun','BD','BDes','BE','BEcon','BEcon&Fin', 'M.P.P.M.', 'MPPM',
           'BEconSci', 'BEd','BEng','BES','BEng(Tech)','BFA','BFin','BFLS','BFST','BH','BHealthSc','BHSc','BHy','BJur','BL','BLE','BLegSc','BLib','BLing','BLitt','BLittCelt','BLS','BMedSc','BMet',
           'BMid', 'BMin','BMS','BMSc','BMSc','BMS','BMus','BMusEd','BMusPerf','BN', 'BNS','BNurs','BOptom','BPA','BPharm', 'BPhil', 'TTC', 'DIP', 'Tchg', 'BEd', 'MEd','ACIB', 'FCIM', 'FCIS', 'FCS', 'Fcs',
@@ -123,6 +125,18 @@ class FullNameParser {
   protected $not_nicknames = array( "(hons)");
 
 
+  /**
+   * Static convenience wrapper around parse_name().
+   *
+   * Creates a fresh parser instance and passes the name to its
+   * parse_name() method, so callers do not have to instantiate the class.
+   *
+   * @param string $name the full name you wish to parse
+   * @return array returns associative array of name parts
+   */
+  public static function parse($name) {
+    $instance = new self;
+    return $instance->parse_name($name);
+  }
+
+
   /**
    * This is the primary method which calls all other methods
    *
@@ -139,15 +153,15 @@ public function parse_name($full_name) {
     // $full_name = str_replace("(hons)", '', $full_name );
 
     # Setup default vars
-    extract(array('salutation' => '', 'fname' => '', 'initials' => '', 'lname' => '', 'suffix' => ''));
+    extract(array('salutation' => '', 'fname' => '', 'initials' => '', 'lname' => '', 'lname_base' => '', 'lname_compound' => '', 'suffix' => ''));
 
     # Find all the professional suffixes possible
     $professional_suffix = $this->get_pro_suffix($full_name);
 
-    // The position of the first professional suffix denotes then end of the name and the start of the suffixes
-    $first_suffix_index = strlen($full_name);
+    // The position of the first professional suffix denotes the end of the name and the start of suffixes
+    $first_suffix_index = mb_strlen($full_name);
     foreach ($professional_suffix as $key => $psx) {
-      $start = strpos($full_name, $psx);
+      $start = mb_strpos($full_name, $psx);
       if( $start === FALSE ) {
         echo "ASSERT ERROR, the professional suffix:".$psx." cannot be found in the full name:".$full_name."<br>";
         continue;
@@ -158,36 +172,36 @@ public function parse_name($full_name) {
     }
 
     // everything to the right of the first professional suffix is part of the suffix
-    $suffix = substr($full_name, $first_suffix_index);
+    $suffix = mb_substr($full_name, $first_suffix_index);
 
     // remove the suffixes from the full_name
-    $full_name = substr($full_name, 0, $first_suffix_index);
+    $full_name = mb_substr($full_name, 0, $first_suffix_index);
 
     # Deal with nickname, push to array
     $has_nick = $this->get_nickname($full_name);
     if ($has_nick) {
       # Remove wrapper chars from around nickname
-      $name['nickname'] = substr($has_nick, 1, (strlen($has_nick) - 2));
+      $name['nickname'] = mb_substr($has_nick, 1, (mb_strlen($has_nick) - 2));
       # Remove the nickname from the full name
       $full_name = str_replace($has_nick, '', $full_name);
       # Get rid of consecutive spaces left by the removal
       $full_name = str_replace('  ', ' ', $full_name);
     }
-
+    
     # Grab a list of words from the remainder of the full name
     $unfiltered_name_parts = $this->break_words($full_name);
 
     # Is first word a title or multiple titles consecutively?
     if( count($unfiltered_name_parts)) {
       // only start looking if there are any words left in the name to process
-      while ($s = $this->is_salutation($unfiltered_name_parts[0])) {
+      while (count($unfiltered_name_parts) > 0 && $s = $this->is_salutation($unfiltered_name_parts[0])) {
         $salutation .= "$s ";
         array_shift($unfiltered_name_parts);
       }
       $salutation = trim($salutation);
-        // Find if there is a line suffix, if so then move it out
+      // Find if there is a line suffix, if so then move it out
       # Is last word a suffix or multiple suffixes consecutively?
-      while ($s = $this->is_line_suffix($unfiltered_name_parts[count($unfiltered_name_parts)-1], $full_name)) {
+      while (count($unfiltered_name_parts) > 0 && $s = $this->is_line_suffix($unfiltered_name_parts[count($unfiltered_name_parts)-1], $full_name)) {
         if( $suffix != "") {
           $suffix = $s.", ".$suffix;
         } else {
@@ -200,23 +214,24 @@ public function parse_name($full_name) {
       $salutation = "";
       $suffix = "";
     }
-    
+
     // Re-pack the unfiltered name parts array and exclude empty words
     $name_arr = array();
     foreach ($unfiltered_name_parts as $key => $name_part) {
       $name_part = trim($name_part);
-      if(strlen($name_part) == '1') {
+      $name_part = rtrim($name_part,',');
+      if(mb_strlen($name_part) == '1') {
         // If any word left is of one character that is not alphabetic then it is not a real word, so remove it
-        if( ! ctype_alpha($name_part)) {
+        if( ! $this->mb_ctype_alpha($name_part)) {
           $name_part = "";
         }
       }
-      if( strlen(trim($name_part)) ) {
+      if(mb_strlen(trim($name_part)) ) {
         $name_arr[] = $name_part;
       }
     }
     $unfiltered_name_parts = $name_arr;
-  
+
     # set the ending range after prefix/suffix trim
     $end = count($unfiltered_name_parts);
 
@@ -237,15 +252,15 @@ public function parse_name($full_name) {
           # for ex: "R. Jason Smith" => "Jason Smith" & "R." is stored as an initial
           # but "R. J. Smith" => "R. Smith" and "J." is stored as an initial
           if ($this->is_initial($unfiltered_name_parts[$i+1])) {
-            $fname .= " ".strtoupper($word);
+            $fname .= " ".mb_strtoupper($word);
           }
           else {
-            $initials .= " ".strtoupper($word);
+            $initials .= " ".mb_strtoupper($word);
           }
         }
         # otherwise, just go ahead and save the initial
         else {
-          $initials .= " ".strtoupper($word);
+          $initials .= " ".mb_strtoupper($word);
         }
       }
       else {
@@ -256,8 +271,13 @@ public function parse_name($full_name) {
     if( count($unfiltered_name_parts)) {
       # check that we have more than 1 word in our string
       if ($end-0 > 1) {
-        # concat the last name
+        # concat the last name and split last name in base and compound
         for ($i; $i < $end; $i++) {
+          if ($this->is_compound($unfiltered_name_parts[$i])) {
+            $lname_compound .= " ".$unfiltered_name_parts[$i];
+          } else {
+            $lname_base .= " ".$this->fix_case($unfiltered_name_parts[$i]);
+          }
           $lname .= " ".$this->fix_case($unfiltered_name_parts[$i]);
         }
       }
@@ -274,6 +294,8 @@ public function parse_name($full_name) {
     $name['fname'] = trim($fname);
     $name['initials'] = trim($initials);
     $name['lname'] = trim($lname);
+    $name['lname_base'] = trim($lname_base);
+    $name['lname_compound'] = trim($lname_compound);
     $name['suffix'] = $suffix;
     return $name;
   }
@@ -306,16 +328,14 @@ public function break_words($name) {
    * @return mixed returns the suffix if exists, false otherwise
    */
   public function get_pro_suffix($name) {
-    
+
     $found_suffix_arr = array();
     foreach ($this->dict['suffixes']['prof'] as $suffix) {
-      if (preg_match("/,[\s]*$suffix\b/i", $name, $matches)) {
+      if (preg_match('/[,\s]+'.preg_quote($suffix).'\b/i', $name, $matches)) {
         $found_suffix = trim($matches[0]);
         $found_suffix = rtrim($found_suffix,',');
         $found_suffix = ltrim($found_suffix,',');
         $found_suffix_arr[] = trim($found_suffix);
-      } else if( strpos($name, $suffix) !== FALSE ) {
-         $found_suffix_arr[] = $suffix;
       }
     }
     return $found_suffix_arr;
@@ -337,7 +357,7 @@ public function get_pro_suffix($name) {
    */
   protected function get_nickname($name) {
     if (preg_match("/[\(|\"].*?[\)|\"]/", $name, $matches)) {
-      if( ! in_array( strtolower($matches[0]), $this->not_nicknames ) ) {
+      if( ! in_array( mb_strtolower($matches[0]), $this->not_nicknames ) ) {
         return $matches[0];
       } else {
         return false;
@@ -358,11 +378,11 @@ protected function get_nickname($name) {
   protected function is_line_suffix($word, $name) {
 
     # Ignore periods and trailing commas, normalize case
-    $word = str_replace('.', '', strtolower($word));
+    $word = str_replace('.', '', mb_strtolower($word));
     $word = rtrim($word,',');
 
     # Search the array for our word
-    $line_match = array_search($word, array_map('strtolower', $this->dict['suffixes']['line']));
+    $line_match = array_search($word, array_map('mb_strtolower', $this->dict['suffixes']['line']));
 
     # Now test our edge cases based on lineage
     if ($line_match !== false) {
@@ -378,7 +398,7 @@ protected function is_line_suffix($word, $name) {
 
         # If name is Joshua Senior, it's pretty likely that Senior is the surname
         # However, if the name is Joshua Jones Senior, then it's likely a suffix
-        if (str_word_count($name) < 3) {
+        if ($this->mb_str_word_count($name) < 3) {
           return false;
         }
 
@@ -404,7 +424,7 @@ protected function is_line_suffix($word, $name) {
    * @return boolean
    */
   protected function is_salutation($word) {
-    $word = str_replace('.', '', strtolower($word));
+    $word = str_replace('.', '', mb_strtolower($word));
     foreach ($this->dict['prefix'] as $replace => $originals) {
       if (in_array($word, $originals)) {
         return $replace;
@@ -422,7 +442,7 @@ protected function is_salutation($word) {
    * @return boolean
    */
   protected function is_compound($word) {
-    return array_search(strtolower($word), $this->dict['compound']);
+    # Boolean membership test: array_search() returned the matching index, so a
+    # hit on the first entry ('da', index 0) evaluated as false for callers.
+    # The strict flag keeps the comparison string-to-string without type juggling.
+    return in_array(mb_strtolower($word), $this->dict['compound'], true);
   }
 
 
@@ -434,7 +454,7 @@ protected function is_compound($word) {
    * @return boolean
    */
   protected function is_initial($word) {
-    return ((strlen($word) == 1) || (strlen($word) == 2 && $word{1} == "."));
+    # An initial is a single character, or a single character followed by a period.
+    # The second character is read with mb_substr() so the test is character-based
+    # like mb_strlen() (not a byte offset), and it avoids the curly-brace offset
+    # syntax ($word{1}) that was removed in PHP 8.
+    $length = mb_strlen($word);
+    return ($length == 1) || ($length == 2 && mb_substr($word, 1, 1) == ".");
   }
 
 
@@ -446,7 +466,7 @@ protected function is_initial($word) {
    * @return boolean
    */
   protected function is_camel_case($word) {
-    if (preg_match("/[A-Za-z]([A-Z]*[a-z][a-z]*[A-Z]|[a-z]*[A-Z][A-Z]*[a-z])[A-Za-z]*/", $word)) {
+    # Looks for a lower-to-upper or upper-to-lower transition inside the word.
+    # The "u" modifier makes PCRE read $word as UTF-8, so \p{L}, \p{Lu} and \p{Ll}
+    # classify whole characters instead of the individual bytes of a multibyte letter.
+    if (preg_match('/\p{L}(\p{Lu}*\p{Ll}\p{Ll}*\p{Lu}|\p{Ll}*\p{Lu}\p{Lu}*\p{Ll})\p{L}*/u', $word)) {
       return true;
     }
     return false;
   }
@@ -457,43 +477,43 @@ protected function is_camel_case($word) {
   public function fix_case($word) {
 
     # Fix case for words split by periods (J.P.)
-    if (strpos($word, '.') !== false) {
+    if (mb_strpos($word, '.') !== false) {
       $word = $this->safe_ucfirst(".", $word);;
     }
 
     # Fix case for words split by hyphens (Kimura-Fay)
-    if (strpos($word, '-') !== false) {
+    if (mb_strpos($word, '-') !== false) {
       $word = $this->safe_ucfirst("-", $word);
     }
 
     # Special case for single letters
-    if (strlen($word) == 1) {
-      $word = strtoupper($word);
+    if (mb_strlen($word) == 1) {
+      $word = mb_strtoupper($word);
     }
 
     # Special case for 2-letter words
-    if (strlen($word) == 2) {
+    if (mb_strlen($word) == 2) {
       # Both letters vowels (uppercase both)
-      if (in_array(strtolower($word{0}), $this->dict['vowels']) && in_array(strtolower($word{1}), $this->dict['vowels'])) {
-        $word = strtoupper($word);
+      if (in_array(mb_strtolower($word{0}), $this->dict['vowels']) && in_array(mb_strtolower($word{1}), $this->dict['vowels'])) {
+        $word = mb_strtoupper($word);
       }
       # Both letters consonants (uppercase both)
-      if (!in_array(strtolower($word{0}), $this->dict['vowels']) && !in_array(strtolower($word{1}), $this->dict['vowels'])) {
-        $word = strtoupper($word);
+      if (!in_array(mb_strtolower($word{0}), $this->dict['vowels']) && !in_array(mb_strtolower($word{1}), $this->dict['vowels'])) {
+        $word = mb_strtoupper($word);
       }
       # First letter is vowel, second letter consonant (uppercase first)
-      if (in_array(strtolower($word{0}), $this->dict['vowels']) && !in_array(strtolower($word{1}), $this->dict['vowels'])) {
-        $word = ucfirst(strtolower($word));
+      if (in_array(mb_strtolower($word{0}), $this->dict['vowels']) && !in_array(mb_strtolower($word{1}), $this->dict['vowels'])) {
+        $word = $this->mb_ucfirst(mb_strtolower($word));
       }
       # First letter consonant, second letter vowel or "y" (uppercase first)
-      if (!in_array(strtolower($word{0}), $this->dict['vowels']) && (in_array(strtolower($word{1}), $this->dict['vowels']) || strtolower($word{1}) == 'y')) {
-        $word = ucfirst(strtolower($word));
+      if (!in_array(mb_strtolower($word{0}), $this->dict['vowels']) && (in_array(mb_strtolower($word{1}), $this->dict['vowels']) || mb_strtolower($word{1}) == 'y')) {
+        $word = $this->mb_ucfirst(mb_strtolower($word));
       }
     }
 
-    # Fix case for words which aren't initials, but are all upercase or lowercase
-    if ( (strlen($word) >= 3) && (ctype_upper($word) || ctype_lower($word)) ) {
-      $word = ucfirst(strtolower($word));
+    # Fix case for words which aren't initials, but are all uppercase or lowercase
+    if ( (mb_strlen($word) >= 3) && ($this->mb_ctype_upper($word) || $this->mb_ctype_lower($word)) ) {
+      $word = $this->mb_ucfirst(mb_strtolower($word));
     }
 
     return $word;
@@ -504,9 +524,45 @@ public function safe_ucfirst($seperator, $word) {
     # uppercase words split by the seperator (ex. dashes or periods)
     $parts = explode($seperator, $word);
     foreach ($parts as $word) {
-      $words[] = ($this->is_camel_case($word)) ? $word : ucfirst(strtolower($word));
+      $words[] = ($this->is_camel_case($word)) ? $word : $this->mb_ucfirst(mb_strtolower($word));
     }
     return implode($seperator, $words);
   }
 
+    /**
+     * Multibyte-aware counterpart of ctype_alpha().
+     *
+     * @param string $text the text to test
+     * @return boolean true when $text is non-empty and consists only of letters
+     */
+    public function mb_ctype_alpha($text)
+    {
+      # "u" makes PCRE treat $text as UTF-8 so \p{L} matches whole characters;
+      # "+" and \z reject the empty string and a trailing newline, as ctype_alpha() does.
+      # preg_match() returns false on invalid UTF-8, which the cast turns into false.
+      return (bool)preg_match('/^\p{L}+\z/u', $text);
+    }
+
+    /**
+     * Multibyte-aware counterpart of ctype_lower().
+     *
+     * @param string $text the text to test
+     * @return boolean true when $text is non-empty and consists only of lowercase letters
+     */
+    public function mb_ctype_lower($text)
+    {
+      # "u" makes PCRE treat $text as UTF-8 so \p{Ll} matches whole characters;
+      # "+" and \z reject the empty string and a trailing newline, as ctype_lower() does.
+      # preg_match() returns false on invalid UTF-8, which the cast turns into false.
+      return (bool)preg_match('/^\p{Ll}+\z/u', $text);
+    }
+
+    /**
+     * Multibyte-aware counterpart of ctype_upper().
+     *
+     * @param string $text the text to test
+     * @return boolean true when $text is non-empty and consists only of uppercase letters
+     */
+    public function mb_ctype_upper($text)
+    {
+      # "u" makes PCRE treat $text as UTF-8 so \p{Lu} matches whole characters;
+      # "+" and \z reject the empty string and a trailing newline, as ctype_upper() does.
+      # preg_match() returns false on invalid UTF-8, which the cast turns into false.
+      return (bool)preg_match('/^\p{Lu}+\z/u', $text);
+    }
+
+    /**
+     * Multibyte-safe replacement for str_word_count(): counts the
+     * whitespace-separated words in $text.
+     *
+     * @param string $text the text to count words in
+     * @return int number of words; 0 for an empty or whitespace-only string
+     */
+    public function mb_str_word_count($text)
+    {
+      $text = trim((string)$text);
+      if ($text === '') {
+        return 0;
+      }
+      # Split on runs of whitespace and count the pieces. (The previous pattern
+      # '/s+/' matched the letter "s", and preg_match() can only report 0 or 1,
+      # so the result never exceeded 2.)
+      $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
+      if ($words === false) {
+        # The "u" pattern fails on invalid UTF-8; fall back to byte-wise splitting.
+        $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
+      }
+      return count($words);
+    }
+
+    /**
+     * Multibyte-aware counterpart of ucfirst(): upper-cases the first
+     * character and leaves the remainder of the string untouched.
+     *
+     * @param string $string the text to capitalise
+     * @return string $string with its first character upper-cased
+     */
+    public function mb_ucfirst($string)
+    {
+      $head = mb_substr($string, 0, 1);
+      $tail = mb_substr($string, 1, mb_strlen($string) - 1);
+      return mb_strtoupper($head) . $tail;
+    }
 }