# This is a quick implementation of a test to detect the hispanic origin # of a given last name, as described in Technical Working Paper No. 13 (part # 7.1.3, Orthographic Structure of Surnames) of the Population Division of the # U.S. Census Bureau. # # # Copyright (c) 2007 Jordi S. Bunster # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. class String def hispanic? # We first have to "normalize" the name, so we make it lowercase and # replace characters beyond ASCII with the nearest sound. lastname = self.downcase accent_replacement_map = { ['á','à','â','ä','Ä','Â','À','Á','ã','Ã'] => 'a', ['é','è','ê','ë','Ë','Ê','È','É'] => 'e', ['í','ì','î','ï','Ï','Î','Ì','Í'] => 'i', ['ó','ò','ô','ö','Ö','Ô','Ò','Ó','õ','Õ'] => 'o', ['ú','ù','û','ü','Ü','Û','Ù','Ú'] => 'u', ['ñ','Ñ'] => 'n', ['ç','Ç'] => 'c', } accent_replacement_map.each do |acccent_family, replacement| acccent_family.each { |accent| lastname.gsub!(accent, replacement) } end # We start with some common known exceptions that fool the test: COMMON_EXCEPTIONS.each do |exception| return true if lastname == exception end # If lastname (sub)contains a single one of these, it is not hispanic. INVALID_SUBSTRINGS.each do |sub_string| return false if lastname.include? sub_string end # If the last two letters of lastname are 'pi', the next test is moot. unless lastname.slice(-2, 2) == 'pi' # The last three letters of lastname *must* be in this array return false unless VALID_STARTS.include? lastname.slice(-3, 3) end # If the last two letters of lastname are 'pi', the next test is moot. unless lastname.slice(0, 2) == 'pi' # The first three letters of lastname *must* be in this array return false unless VALID_ENDS.include? lastname.slice(0, 3) end true # since we've gotten so far end ##### BEWARE: Ugly large arrays from now on, and nothing more. :) COMMON_EXCEPTIONS = [ 'saavedra', 'cotto', 'jasso', 'delossantos' ] INVALID_SUBSTRINGS = [ 'w','k','tt','nn','aa','mm','bb','cc','dd','ee', 'ff','gg','hh','ii','jj','oo','pp','qq','ss','uu','vv','xx','yy','zz' ] VALID_STARTS = [ 'aal','aba','abe','abi','abo','abu','aca','ace','ach', 'aci','aco','ada','ade','adi','ado','ady','ael','aen','aes','aex','aez', 'afa','afe','aga','ago','agu','aig','ain','aio','ais','ait','aiz','aja', 'ajo','ala','ald','ale','ali','all','alo','als','alt','alu','aly','alz', 'ama','ame','amo','amp','ana','and','ane','ani','ano','ans','ant','any', 'anz','aos','apa','ape','apo','ara','ard','are','ari','aro','arr','ars', 'art','ary','arz','asa','ash','asi','aso','asp','ass','asu','ata','ate', 'ath','ati','ato','ats','att','atu','aua','aud','aue','aul','aun','aur', 'aus','aut','aux','auz','ava','ave','avo','aya','ayo','aza','azo','azu', 'bad','bal','ban','bao','bar','bas','bat','bau','bay','baz','bea','bel', 'ben','beo','ber','bes','bet','beu','bey','bez','bia','bie','bil','bin', 'bio','bis','biz','bla','blo','boa','bol','bon','boo','bor','bos','bot', 'boy','boz','bra','bre','bro','bua','buz','cal','can','car','cas','cay', 'caz','cca','cea','ced','cel','cen','ceo','cer','ces','cez','cha','che', 'cho','chy','chz','cia','cid','cil','cin','cio','cis','ciz','coa','com', 'con','cor','cos','cot','coy','coz','cre','cri','cto','cua','cud','cue', 'cui','cun','cur','cut','cuy','cuz','dad','dal','dan','dar','das','day', 'daz','dea','dei','del','deo','der','des','det','deu','dey','dez','dia', 'die','dil','din','dio','dip','dis','dit','diz','dlo','dma','doa','doc', 'dol','don','dor','dos','dot','doy','doz','dra','dre','dro','dua','due', 'dui','duy','duz','eal','ean','ear','eas','eaz','eba','ebo','eca','ech', 'eco','eda','ede','edo','ega','ego','egu','eia','ein','eis','eja','ejo', 'ela','eli','ell','elo','ely','ema','eme','emo','emy','ena','ene','eng', 'eni','eno','ens','ent','eny','enz','eon','eos','eoz','epa','epe','epo', 'era','ern','ero','err','ers','ert','eru','erz','esa','ese','esi','eso', 'ess','esu','eta','ete','eto','ets','ety','eus','eut','euz','eva','eve', 'evo','eya','eyo','eza','ezo','ezu','fan','far','fas','fat','fau','fay', 'faz','feo','fex','fez','ffa','ffi','ffo','fil','fin','fio','fiz','foa', 'fos','fox','foz','fra','fre','fuz','gal','gan','gar','gas','gat','gau', 'gay','gaz','gdo','gea','gel','gen','ger','ges','get','gez','gia','gil', 'gin','gio','giz','gle','goa','gol','gon','gor','gos','got','goy','goz', 'gra','gre','gro','gua','gue','gui','gul','gun','guo','gur','gus','guy', 'guz','han','har','has','haz','hea','hel','heo','her','hes','het','heu', 'hez','hia','hil','hin','hio','his','hiz','hoa','hon','hor','hos','hot', 'hoz','hua','hue','hui','huz','ial','ian','iar','ias','iay','iaz','iba', 'ibe','ibu','ica','ice','ico','ida','ide','idi','ido','iel','ien','ier', 'ies','iez','ifa','ife','iga','igo','igu','ija','ijo','ila','ile','ill', 'ilo','ils','ilu','ily','ima','ime','imo','ina','ine','ini','ino','ins', 'inz','iol','ion','ios','ioz','ipa','ipe','ipi','ipo','ira','ire','iri', 'iro','iru','isa','iso','isu','ita','ite','iti','ito','its','itu','ity', 'itz','ius','iuz','iva','ive','ivo','ixa','iya','iye','iza','izo','izu', 'jal','jam','jan','jar','jas','jaz','jea','jel','jen','jer','jes','jez', 'jia','jil','jio','jiz','joa','jol','jon','joo','jos','joz','jua','jul', 'lan','lao','lar','las','lat','lau','lay','laz','lba','lbe','lbi','lbo', 'lca','lce','lco','lda','lde','ldi','ldo','ldu','lea','led','leg','lem', 'len','leo','ler','les','let','lex','lez','lfo','lga','lgo','lia','lid', 'lil','lin','lio','lis','liu','lix','liz','lja','ljo','lla','lle','lli', 'llo','lls','lma','lme','lmo','lna','loa','lom','lon','lor','los','lot', 'lou','loy','loz','lpa','lpi','lsa','lso','lta','lto','lua','lum','luo', 'luz','lva','lve','lvi','lvo','lza','lzo','lzu','mal','man','mao','mar', 'mas','mat','mau','may','maz','mba','mbo','mea','mei','mel','men','meo', 'mer','mes','meu','mey','mez','mia','mil','min','mio','mir','mis','miz', 'mlo','moa','mol','mon','mor','mos','mot','moz','mpa','mpo','mua','mus', 'muy','muz','nal','nan','nao','nar','nar','nas','nat','nau','naz','nca', 'nce','nco','nda','nde','ndi','ndo','ndz','nea','nel','neo','ner','nes', 'net','nex','ney','nez','nfo','nga','nge','ngo','nia','nil','nin','nio', 'nis','niz','nja','nje','njo','nna','nne','noa','nod','nol','non','nor', 'nos','noz','npa','nsa','nso','nta','nte','nto','ntu','nty','nua','nue', 'nur','nus','nuz','nva','nza','nzo','nzu','oal','oas','oaz','oba','obe', 'obo','oca','oce','och','oco','oda','odo','odz','oea','oel','oen','oes', 'oey','oez','oga','ogo','oig','oin','ois','oix','oiz','oja','ojo','ola', 'oll','olo','ols','oma','ome','omo','oms','ona','one','ong','oni','ono', 'ons','ont','opa','ope','opo','ora','ori','oro','ors','ort','ory','osa', 'ose','oso','ota','ote','oto','oud','oul','ova','ove','ovi','ovo','oya', 'oyo','oza','ozo','ozu','pal','pan','par','pas','paz','per','pes','pez', 'pia','pin','pio','pis','piz','pla','poa','pol','pon','pos','pou','poy', 'poz','ppa','pro','pua','pus','puz','qua','que','qui','rad','rai','ral', 'ran','rao','rar','ras','rat','rau','ray','raz','rba','rbe','rbo','rca', 'rce','rch','rco','rda','rde','rdi','rdo','rdu','rea','red','rel','ren', 'reo','rer','res','ret','reu','rev','rew','rex','rey','rez','rfa','rfi', 'rga','rge','rgi','rgo','ria','rib','rid','rie','rig','ril','rim','rin', 'rio','ris','rit','riu','riz','rja','rjo','rla','rlo','rma','rme','rmo', 'rna','rne','rni','rno','rns','rnu','roa','rol','ron','roo','ros','roy', 'roz','rpa','rpi','rra','rre','rri','rro','rru','rry','rrz','rsa','rse', 'rso','rta','rte','rth','rti','rto','rtu','rty','rua','rue','rui','rur', 'rus','ruz','rva','rvo','rza','rze','rzo','rzu','saa','sad','sal','sar', 'sas','saz','sca','sch','sco','sea','sel','sen','seo','ser','ses','set', 'sez','sga','sgo','sia','sil','sin','sio','sis','siz','sla','sle','sma', 'sme','sno','soa','sol','son','sor','sos','soz','spe','spi','spo','ssa', 'ssi','sso','ssy','sta','ste','sti','sto','stu','sty','sua','sud','sul', 'sun','sus','suz','sva','tad','tal','tan','tao','tar','tas','tau','tay', 'taz','tea','tel','ten','teo','ter','tes','teu','tey','tez','tga','tia', 'tie','til','tin','tio','tir','tis','tiz','toa','tol','ton','tor','tos', 'tot','tou','toy','toz','tra','tre','tro','try','tta','tte','tto','tts', 'tty','tua','tud','tun','tur','tus','tuz','ual','uan','uar','uas','uay', 'uba','ube','ubi','uca','uce','uch','uco','uda','udo','uea','ued','uel', 'uen','ueo','uer','ues','uet','uez','ufe','ufo','uga','ugo','uia','uig', 'uil','uin','uio','uis','uit','uiu','uiz','uja','ujo','ula','ule','uli', 'ulo','uls','uma','ume','una','uno','unz','uon','uoz','upe','upo','ura', 'ure','uri','uro','urt','uru','ury','urz','usa','uso','uta','ute','uto', 'uva','uxo','uya','uyo','uza','uze','uzo','uzu','val','van','var','vas', 'vat','vay','vaz','vdo','vea','vel','ven','veo','ver','ves','vet','vez', 'via','vid','vie','vil','vin','vio','vis','viz','vjo','vle','voa','vol', 'von','vor','vos','voz','vua','vuz','xar','xas','xia','xta','xto','yan', 'yar','yas','yaz','yba','yco','yda','yde','yea','yen','yer','yes','yet', 'yez','yja','ymi','yna','yni','yno','yoa','yol','yon','yor','yos','you', 'yra','yre','yro','yta'