# This is a quick implementation of a test to detect the hispanic origin
# of a given last name, as described in Technical Working Paper No. 13 (part
# 7.1.3, Orthographic Structure of Surnames) of the Population Division of the
# U.S. Census Bureau.
#
#
# Copyright (c) 2007 Jordi S. Bunster
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

class String
  # Tests whether this string, read as a last name, has the orthographic
  # structure of a hispanic surname.
  #
  # The receiver is never modified. Surrounding whitespace is ignored, the
  # name is lowercased, and accented letters are folded to their plain ASCII
  # counterpart before the following checks run, in this order:
  #
  # 1. a name listed in COMMON_EXCEPTIONS is accepted outright;
  # 2. a name containing any entry of INVALID_SUBSTRINGS is rejected;
  # 3. unless the name ends in 'pi', its last three letters must be listed
  #    in VALID_STARTS;
  # 4. unless the name starts with 'pi', its first three letters must be
  #    listed in VALID_ENDS.
  #
  # NOTE: the two trigram constants are named the opposite way round from how
  # they are used (VALID_STARTS is matched against the END of the name and
  # VALID_ENDS against its START). The names are kept as they are so that
  # existing references to String::VALID_STARTS / String::VALID_ENDS keep
  # working.
  #
  # @return [Boolean] true when every check passes, false otherwise
  #   (including for empty or whitespace-only strings)
  def hispanic?
    # "Normalize" the name: drop surrounding whitespace, make it lowercase
    # and replace characters beyond ASCII with the nearest sound. Uppercase
    # accents stay in the sets in case downcase leaves them untouched.
    # String#tr pads a shorter replacement with its last character, so each
    # whole family collapses onto a single letter.
    lastname = strip.downcase
                    .tr('áàâäÄÂÀÁãÃ', 'a')
                    .tr('éèêëËÊÈÉ', 'e')
                    .tr('íìîïÏÎÌÍ', 'i')
                    .tr('óòôöÖÔÒÓõÕ', 'o')
                    .tr('úùûüÜÛÙÚ', 'u')
                    .tr('ñÑ', 'n')
                    .tr('çÇ', 'c')

    # We start with some common known exceptions that fool the test.
    return true if COMMON_EXCEPTIONS.include?(lastname)

    # If lastname contains a single one of these, it is not hispanic.
    return false if INVALID_SUBSTRINGS.any? { |fragment| lastname.include?(fragment) }

    # If the last two letters are 'pi' the ending test is moot; otherwise the
    # last three letters *must* be listed (names shorter than three letters
    # yield nil here and are therefore rejected).
    return false unless lastname.end_with?('pi') || VALID_STARTS.include?(lastname[-3, 3])

    # If the first two letters are 'pi' the beginning test is moot; otherwise
    # the first three letters *must* be listed.
    return false unless lastname.start_with?('pi') || VALID_ENDS.include?(lastname[0, 3])

    true # since we've gotten so far
  end

  ##### BEWARE: Ugly large arrays from now on, and nothing more. :)
  #
  # The arrays are intentionally left unfrozen so that code which already
  # appends to them (for instance extra exceptions) is not broken.

  # Normalized names accepted without running any other check.
  COMMON_EXCEPTIONS = %w[saavedra cotto jasso delossantos]

  # Letter sequences whose presence anywhere in the name rejects it.
  INVALID_SUBSTRINGS = %w[
    w k tt nn aa mm bb cc dd ee
    ff gg hh ii jj oo pp qq ss uu vv xx yy zz
  ]

  # Accepted trigrams for the LAST three letters of the name (see the note
  # on #hispanic? about the naming).
  VALID_STARTS = %w[
    aal aba abe abi abo abu aca ace ach
    aci aco ada ade adi ado ady ael aen aes aex aez
    afa afe aga ago agu aig ain aio ais ait aiz aja
    ajo ala ald ale ali all alo als alt alu aly alz
    ama ame amo amp ana and ane ani ano ans ant any
    anz aos apa ape apo ara ard are ari aro arr ars
    art ary arz asa ash asi aso asp ass asu ata ate
    ath ati ato ats att atu aua aud aue aul aun aur
    aus aut aux auz ava ave avo aya ayo aza azo azu
    bad bal ban bao bar bas bat bau bay baz bea bel
    ben beo ber bes bet beu bey bez bia bie bil bin
    bio bis biz bla blo boa bol bon boo bor bos bot
    boy boz bra bre bro bua buz cal can car cas cay
    caz cca cea ced cel cen ceo cer ces cez cha che
    cho chy chz cia cid cil cin cio cis ciz coa com
    con cor cos cot coy coz cre cri cto cua cud cue
    cui cun cur cut cuy cuz dad dal dan dar das day
    daz dea dei del deo der des det deu dey dez dia
    die dil din dio dip dis dit diz dlo dma doa doc
    dol don dor dos dot doy doz dra dre dro dua due
    dui duy duz eal ean ear eas eaz eba ebo eca ech
    eco eda ede edo ega ego egu eia ein eis eja ejo
    ela eli ell elo ely ema eme emo emy ena ene eng
    eni eno ens ent eny enz eon eos eoz epa epe epo
    era ern ero err ers ert eru erz esa ese esi eso
    ess esu eta ete eto ets ety eus eut euz eva eve
    evo eya eyo eza ezo ezu fan far fas fat fau fay
    faz feo fex fez ffa ffi ffo fil fin fio fiz foa
    fos fox foz fra fre fuz gal gan gar gas gat gau
    gay gaz gdo gea gel gen ger ges get gez gia gil
    gin gio giz gle goa gol gon gor gos got goy goz
    gra gre gro gua gue gui gul gun guo gur gus guy
    guz han har has haz hea hel heo her hes het heu
    hez hia hil hin hio his hiz hoa hon hor hos hot
    hoz hua hue hui huz ial ian iar ias iay iaz iba
    ibe ibu ica ice ico ida ide idi ido iel ien ier
    ies iez ifa ife iga igo igu ija ijo ila ile ill
    ilo ils ilu ily ima ime imo ina ine ini ino ins
    inz iol ion ios ioz ipa ipe ipi ipo ira ire iri
    iro iru isa iso isu ita ite iti ito its itu ity
    itz ius iuz iva ive ivo ixa iya iye iza izo izu
    jal jam jan jar jas jaz jea jel jen jer jes jez
    jia jil jio jiz joa jol jon joo jos joz jua jul
    lan lao lar las lat lau lay laz lba lbe lbi lbo
    lca lce lco lda lde ldi ldo ldu lea led leg lem
    len leo ler les let lex lez lfo lga lgo lia lid
    lil lin lio lis liu lix liz lja ljo lla lle lli
    llo lls lma lme lmo lna loa lom lon lor los lot
    lou loy loz lpa lpi lsa lso lta lto lua lum luo
    luz lva lve lvi lvo lza lzo lzu mal man mao mar
    mas mat mau may maz mba mbo mea mei mel men meo
    mer mes meu mey mez mia mil min mio mir mis miz
    mlo moa mol mon mor mos mot moz mpa mpo mua mus
    muy muz nal nan nao nar nar nas nat nau naz nca
    nce nco nda nde ndi ndo ndz nea nel neo ner nes
    net nex ney nez nfo nga nge ngo nia nil nin nio
    nis niz nja nje njo nna nne noa nod nol non nor
    nos noz npa nsa nso nta nte nto ntu nty nua nue
    nur nus nuz nva nza nzo nzu oal oas oaz oba obe
    obo oca oce och oco oda odo odz oea oel oen oes
    oey oez oga ogo oig oin ois oix oiz oja ojo ola
    oll olo ols oma ome omo oms ona one ong oni ono
    ons ont opa ope opo ora ori oro ors ort ory osa
    ose oso ota ote oto oud oul ova ove ovi ovo oya
    oyo oza ozo ozu pal pan par pas paz per pes pez
    pia pin pio pis piz pla poa pol pon pos pou poy
    poz ppa pro pua pus puz qua que qui rad rai ral
    ran rao rar ras rat rau ray raz rba rbe rbo rca
    rce rch rco rda rde rdi rdo rdu rea red rel ren
    reo rer res ret reu rev rew rex rey rez rfa rfi
    rga rge rgi rgo ria rib rid rie rig ril rim rin
    rio ris rit riu riz rja rjo rla rlo rma rme rmo
    rna rne rni rno rns rnu roa rol ron roo ros roy
    roz rpa rpi rra rre rri rro rru rry rrz rsa rse
    rso rta rte rth rti rto rtu rty rua rue rui rur
    rus ruz rva rvo rza rze rzo rzu saa sad sal sar
    sas saz sca sch sco sea sel sen seo ser ses set
    sez sga sgo sia sil sin sio sis siz sla sle sma
    sme sno soa sol son sor sos soz spe spi spo ssa
    ssi sso ssy sta ste sti sto stu sty sua sud sul
    sun sus suz sva tad tal tan tao tar tas tau tay
    taz tea tel ten teo ter tes teu tey tez tga tia
    tie til tin tio tir tis tiz toa tol ton tor tos
    tot tou toy toz tra tre tro try tta tte tto tts
    tty tua tud tun tur tus tuz ual uan uar uas uay
    uba ube ubi uca uce uch uco uda udo uea ued uel
    uen ueo uer ues uet uez ufe ufo uga ugo uia uig
    uil uin uio uis uit uiu uiz uja ujo ula ule uli
    ulo uls uma ume una uno unz uon uoz upe upo ura
    ure uri uro urt uru ury urz usa uso uta ute uto
    uva uxo uya uyo uza uze uzo uzu val van var vas
    vat vay vaz vdo vea vel ven veo ver ves vet vez
    via vid vie vil vin vio vis viz vjo vle voa vol
    von vor vos voz vua vuz xar xas xia xta xto yan
    yar yas yaz yba yco yda yde yea yen yer yes yet
    yez yja ymi yna yni yno yoa yol yon yor yos you
    yra yre yro yta yti yua yud yus yva yza zal zan
    zar zas zca zco zea zeo zes zez zga zia zil zin
    zio zma zoa zol zon zor zos zot zoy zoz zpe zpi
    zta zua zud zun zur
  ]

  # Accepted trigrams for the FIRST three letters of the name (see the note
  # on #hispanic? about the naming).
  VALID_ENDS = %w[
    aba abb abd abe abi abl abo abr abu
    aca acc ace ach aci aco acu ada ade ado adr ads
    adu adv aed aez afa afr aga age ago agr agu agv
    ahe ahu aib aic aig ain air ais ait aiz aja aju
    ala alb alc ald ale alf alg ali all alm alo alp
    alq als alt alu alv alz ama amb ame ami amo amp
    amu ana anc and ane ang ani anj ano anq ans ant
    anz apa ape apo apr apu aqu ara arb arc ard are
    arg ari arj arl arm arn aro arp arq arr ars art
    aru arv arz asa asc ase asi asp ass ast asu ata
    ate ati ato atr att atu auc aud auf aui aul aum
    aur auz ava ave avi aya ayb ayc aye ayl aym ayo
    ayu aza azc aze azi azn azo azp azu bab bac bad
    bae bag bah bai baj bal bam ban bap baq bar bas
    bat bau bav bay baz bea bec bed beg bei bej bel
    ben beo beq ber bes bet bex bez bia bib bic bid
    bie big bil bin bir bis bit biv bla ble blo boa
    bob boc bod bof bog boh boi boj bol bom bon bor
    bos bot bou bov boy bra bre bri bro bru bua bub
    buc bue buf bug bui buj bul bur bus but bux buy
    buz caa cab cac cad cae cag cah cai caj cal cam
    can cao cap caq car cas cat cau cav cay caz cde
    cea ceb cec ced ceg cei cej cel cen cep cer ces
    cet cev cey cha che chi cho chu cib cic cid cie
    cif cig cil cim cin cio cip cir cis civ cla cle
    cli coa cob coc cod coe cof coi col com con cop
    cor cos cot cou cov coy coz cre cri cru cua cub
    cuc cue cui cul cum cun cup cur cus cut cuv cuy
    cuz dab dag dal dam dan dao dap dar das dat dau
    dav daz dea deb dec ded dee def deg deh dei dej
    del dem den deo dep deq der des det deu dev dey
    dez dia die dil dim dio dip dir dis dob doc doj
    dol dom don dop dor dos dov doz dsp dua dub duc
    due duh dul dum duq dur ech ede edq edr ega ege
    egi egl egu eir eiz elc ele elg eli elj elo elu
    elv ely ema emm emp ena enc end ene eng enj enr
    equ era erc erd ere eri ero err esc ese esg esl
    esm esn esp esq est etc ete eud euf eur eus euz
    eva evi evo exi exp eyl eyz eza ezc ezq ezr fab
    fac fag fai faj fal fam fan far fas fau fav fay
    faz feb fed fei fel fem fen feo fer fey fia fid
    fie fig fil fim fio fiq fir fiu fla fle flo foj
    fol fon for fos foy fra fre fri fro fru fue ful
    fum fun fus gab gac gad gae gaf gag gai gaj gal
    gam gan gao gar gas gat gau gav gax gay gaz gea
    geb gei gel gen ger ghi gig gij gil gim gin gir
    gis giz glo gob goc god goe goi gol gom gon gor
    gos got gov goy goz gra gre gri gro gru gua gud
    gue gui gul gum gur gus gut guz hac har haz hec
    heg hel hen her hev hey hib hid hie hig hij hil
    hin hip hir his hit hog hol hom hon hor hos hoy
    hoz hua hue hug hui hum hur hys iba ibe ibi ibo
    ibu ica ice ich idi ido idr iga igl ign igo igu
    ila ild ilh ili ill imp ina inc ind ine inf ing
    ini ino ins int inz ipi iqu ira iri irr iru irv
    isa ise isi isl iso isq isu ith itu iva ixt iza
    izc izn izq izu jac jai jal jan jaq jar jas jat
    jau jav jay jea jem jer jes jim jin jir joa jof
    joj jom jor jos jov joy jua jub jue juf jul jun
    jur jus juv lab lac lad laf lag lah lai laj lal
    lam lan lao lap laq lar las lat lau lav lay laz
    lea leb lec led leg lei lej lel lem len leo lep
    ler les let leu lev ley lez lia lib lic lie lig
    lim lin liq lir lis liz lla lle lli llo llu loa
    lob lod loe log loi loj lom lon lop loq lor los
    lou lov loy loz lua lub luc lud lue lug lui luj
    lum lun lup luq luv luy luz mac mad mae mag mai
    maj mal mam man map maq mar mas mat mau may maz
    mea mec med meg mei mej mel mem men mer mes met
    mev mex mey mez mic mie mig mij mil mim min miq
    mir mis miy moc mod mog moh moj mol mon moq mor
    mos mot mou moy moz muc mud mue mug mui muj mul
    mun mur mus mut mux muz nab nac nad naj nal nan
    nap nar nat nav nay naz neb nec neg nei nej nep
    ner nev ney nia nic nid nie nig nil nin nir nis
    niv niz noa nob noc nod nog nol nom nop nor nos
    nov noy nua nuc nue nun oax oba obe obi obl obr
    obs oca oce och oco oda odi odo odr ofa ofe ofr
    oga ogu ohi oja oje oji ola olb old ole olg oli
    oll olm olo olq olt olv oma oms ona ond one ong
    onn ono ons ont opa opi opo oqu ora orb orc ord
    ore orf org ori orj orn oro orp orq orr ors ort
    oru orv orz osa osc ose osi osl osn oso osp oss
    ost osu ota ote oth oti oto ova ove ovi oxi oya
    oye oyo oyu oza oze ozo ozu pab pac pad pae pag
    pai paj pal pam pan par pas pat pau pav pay paz
    pec ped peg pei pel pen peo pep peq per pes pey
    pez pia pic pie pij pil pim pin pio piq pir pis
    pit piz pla pli plo plu pob pod poe pol pom pon
    poo pop por pos pot pou pov poy poz pra pre pri
    pro pru pub puc pue pug pui puj pul pum pun pup
    pur puy puz qua que qui rab rad rae raf rai raj
    ral ram ran raq ras rau rav rax ray raz rea reb
    rec red ref reg rei rej rel rem ren reo rep req
    res ret rev rex rey rez ria rib ric rid rie rig
    rij rim rin rio rip riq ris riu riv riz roa rob
    roc rod roe rog roh roi roj rol rom ron roq ros
    rot rou rov rox roy roz rua rub ruc rud rue ruf
    rug rui rul rum rup rut ruv ruy ruz saa sab sac
    sad sae saf sag sah sai sal sam san sap sar sas
    sat sau sav say sea seb sec sed seg sei sej sel
    sem sen seo sep seq ser ses set sev sex sez sia
    sib sic sid sie sif sig sil sim sin sio sip siq
    sir sis sit siu siv six sob soc sod soj sol som
    son sop soq sor sos sot sou sov soz spi sua sub
    suc sud sue sui sul sum sun suq sur sus sut suz
    tab tac tad taf tag taj tal tam tan tap tar tas
    tat tav tay teb teh tei tej tel tem ten tep teq
    ter tes tev tex tez thi tia tib tic tie tij tin
    tio tiq tir tis tix tiz tla tob toc tof tog toi
    toj tol tom top tor tos tov toy tra tre tri tro
    tru tua tub tud tue tug tul tun tur tuy uba ube
    ubi uce uch uda ude ufr uga ugi ugu uju ula uli
    ull ult uma ump una und une ung unz ura urb urc
    urd ure urg uri uro urq urr urs urt uru urv urz
    usa usc use usi uso utr uts uva uvi uze uzu vac
    vad vae vai vaj val van vaq var vas vaz vea vec
    ved veg veh vei vej vel ven ver vev vey via vic
    vid vie vig vij vil vin vio viq vir vis vit viv
    viz vol vos voz vue vuo xim xiq xoc yan yaq yar
    yba yca yce ydi yeb yed yeg yej yep yer yes yev
    ygl ygn ygu yll ync ynd yne ynf yni yno ynz yod
    yog yor ypa ypi yra yri yru ysa yse ysl ysq ytu
    yub yuc yud yul yun yur yus yva yza yze yzn yzq
    zab zac zad zaf zag zal zam zan zap zar zat zau
    zav zay zaz zea zeb zed zeg zel zen zep zeq zer
    zet zev zim zir zol zor zoz zua zub zug zul zum
    zun zur zuv zuz
  ]
end