Saturday, April 14, 2012

Unicode character set importer for Python

import os
import os
import shelve

class Keycachedata:
    """Small key/value store persisted in a ``shelve`` file.

    The shelf is named ``qacrossworddata`` and lives in ``self.path``
    (the current working directory unless a path is given).  Every public
    method opens the shelf, does its work and closes it again, so no file
    handle is held between calls.

    Attributes set by the methods:
        path        -- directory holding the shelf file
        pathfile    -- full path passed to ``shelve.open``
        key         -- the key most recently named by a caller
        keydata     -- the value most recently returned by ``retrievekey``
        keydatafile -- the shelf object of the most recent call (closed
                       again by the time the public method returns)
    """

    # Base name of the shelf file inside ``self.path``.
    _FILENAME = 'qacrossworddata'

    def __init__(self, path=None):
        """Remember where the shelf lives; nothing is opened yet.

        path -- directory for the shelf file; defaults to ``os.getcwd()``.
        """
        self.path = os.getcwd() if path is None else path
        self.key = ''

    def __checkreadFile(self):
        """Open the shelf read/write, creating it when it does not exist.

        The 'c' flag is used instead of probing the bare filename with
        ``os.access``: some dbm backends add an extension to the name, so
        the bare file never exists there and the old probe recreated
        (and thereby emptied) the shelf on every call.
        """
        self.pathfile = os.path.join(self.path, self._FILENAME)
        self.keydatafile = shelve.open(self.pathfile, 'c')

    def __verifykey(self):
        """Return True when ``self.key`` is present in the open shelf."""
        return self.key in self.keydatafile

    def deleteKey(self, key):
        """Remove ``key`` from the shelf; a missing key is ignored."""
        self.key = key
        self.__checkreadFile()
        try:
            if self.__verifykey():
                del self.keydatafile[self.key]
        finally:
            self.keydatafile.close()

    def savekey(self, key, keydata):
        """Store ``keydata`` under ``key`` if that key already exists.

        Keys are never created here (use ``addkey`` for that); when the
        key is missing nothing is written and nothing is printed.
        """
        self.key = key
        self.__checkreadFile()
        try:
            if self.__verifykey():
                print('Key verified.')
                self.keydatafile[self.key] = keydata
                # Only report success when something was actually stored.
                print('key data saved.')
        finally:
            self.keydatafile.close()

    def savekeydata(self, keydata):
        """Store ``keydata`` under the current ``self.key`` if it exists."""
        self.__checkreadFile()
        try:
            if self.__verifykey():
                self.keydatafile[self.key] = keydata
                print('key data saved.')
        finally:
            self.keydatafile.close()

    def addkey(self, key):
        """Create ``key`` with an empty dict value unless it exists.

        Returns True when the key already existed, False when it was
        newly added.
        """
        self.key = key
        self.__checkreadFile()
        try:
            keycheck = self.__verifykey()
            if not keycheck:
                self.keydatafile[self.key] = {}
                print('key added.')
            else:
                print('key already exists.')
        finally:
            # Close before returning so the new key is flushed to disk.
            self.keydatafile.close()
        return keycheck

    def retrievekey(self, key):
        """Return the value stored under ``key``, or ``{}`` if missing."""
        self.key = key
        self.__checkreadFile()
        self.keydata = {}
        try:
            if self.__verifykey():
                self.keydata = self.keydatafile[self.key]
        finally:
            self.keydatafile.close()
        return self.keydata

    def retrievekeyname(self):
        """Return the key most recently named by a caller."""
        return self.key

    def retrievekeylist(self):
        """Return a list of every key in the shelf."""
        self.__checkreadFile()
        try:
            return list(self.keydatafile)
        finally:
            self.keydatafile.close()

    def iskeydatempty(self):
        """Return True when the shelf holds no keys."""
        self.__checkreadFile()
        try:
            return len(self.keydatafile) == 0
        finally:
            self.keydatafile.close()

    def returnFirstkey(self):
        """Return ``[key, value]`` for the first key the shelf lists.

        Raises IndexError when the shelf is empty.
        """
        self.__checkreadFile()
        try:
            keylist = list(self.keydatafile.keys())
            firstkeykey = keylist[0]
            # Read the value while the shelf is still open.
            return [firstkeykey, self.keydatafile[firstkeykey]]
        finally:
            self.keydatafile.close()

    def verifykey(self, keyname):
        """Return True when ``keyname`` is in the shelf (``self.key`` is
        left untouched)."""
        self.__checkreadFile()
        try:
            return keyname in self.keydatafile
        finally:
            self.keydatafile.close()

# Field counts, after splitting a data line on ';', that
# preappendunicodedata accepts when reading the unicode data file.
KEYSPOP = 15
KEYSPOPALT = 14 #alternate keys detection

#The following site provides unicode tables that can be used with this
#importer:
#http://www.utf8-chartable.de


#I haven't included try/except handling in this program, so every file
#should be consistently valid and of the same format for this
#to work, i.e. any table written with more than 4 tab separated keys
#or fewer than 3 tab separated keys
#will not work in this file reader.  If so, you'll need to restructure
#the tables or code differently, as I've structured the reader around
#the website tables mentioned above.



def startendwhitespaceremove(stringline):
    """Return ``stringline`` with leading/trailing whitespace removed and
    every internal run of whitespace collapsed to a single space.

    A string with no non-whitespace characters yields ''.
    """
    return ' '.join(stringline.split())

def preappendunicodedata(filename = '', quickfinddict=None):
    """Read a ';'-separated unicode data file and index it by code point
    and by block.

    filename      -- path of the data file.  Only lines with KEYSPOP or
                     KEYSPOPALT fields are parsed; the first three fields
                     are taken as hex code point, name and case/category.
    quickfinddict -- mapping whose keys are ascending integer sequences
                     (first element = lowest code point of a block, last
                     element = highest) and whose values are
                     ``(blockident, blockname)`` pairs, as built by
                     blockimporter in this file.  Defaults to empty.

    Returns ``(dicta, blocksdict)``:
        dicta[uc]      = (name, lineindex, case, blockident)
        blocksdict[tuple(blockident)][uc] = name
    where ``uc`` is the hex string exactly as read and ``lineindex`` is
    the 0-based line number in the file.  Code points that fall in no
    known range, and lines with an empty name, are skipped.
    """
    import bisect

    if quickfinddict is None:
        quickfinddict = {}

    # Sort the range keys once (not once per line) and keep their first
    # elements separately so each lookup is a plain bisect on integers.
    rangekeys = sorted(key for key in quickfinddict if key)
    rangestarts = [key[0] for key in rangekeys]

    def findrangekey(intval):
        """Return the range key containing ``intval``, or None."""
        pos = bisect.bisect_right(rangestarts, intval) - 1
        if pos < 0:
            return None
        candidate = rangekeys[pos]
        if intval > candidate[-1]:
            return None
        return candidate

    dicta = {}
    blocksdict = {}

    with open(filename) as datafile:
        for count, line in enumerate(datafile):
            unilist = line.split(';')
            if len(unilist) not in (KEYSPOP, KEYSPOPALT):
                # Unrecognised line: skip it instead of re-processing
                # the values left over from the previous line.
                continue
            uc, name, case = unilist[0], unilist[1], unilist[2]
            if not name:
                continue

            rangekey = findrangekey(int(uc, 16))
            if rangekey is None:
                continue

            blockident, blockname = quickfinddict[rangekey]
            dicta[uc] = (name, count, case, blockident)
            # blockident may be a list; tuple() makes it usable as a key.
            blocksdict.setdefault(tuple(blockident), {})[uc] = name

    return (dicta, blocksdict)

def readunicodefile(lname = 'LATIN', unicodedat = None, blocksdict = None,
                    block = (0,0)):
    """Collect the letters of one block, grouped by case.

    lname      -- prefix used for the keys of the returned dictionary.
    unicodedat -- mapping ``uc -> (name, index, case, ...)`` where ``uc``
                  is a hex code point string; only items 0 and 2 are read.
    blocksdict -- mapping ``tuple(block) -> {uc: name}``.
    block      -- block identifier; ``tuple(block)`` selects the entry of
                  ``blocksdict``.  A block with no entry yields empty
                  result dictionaries.

    Returns a dict with the keys ``lname+'_LOWER'``, ``lname+'_UPPER'``
    and ``lname+'_OTHER'``.  Each value maps 1, 2, 3, ... to the UTF-8
    encoded character, in the iteration order of the block's entries.
    A character goes to UPPER, LOWER or OTHER when its case field
    contains 'Lu', 'Ll' or 'Lo' respectively (checked in that order);
    anything else is ignored.
    """
    if unicodedat is None:
        unicodedat = {}
    if blocksdict is None:
        blocksdict = {}

    # Python 2 spells the code-point-to-character builtin ``unichr``.
    try:
        tochar = unichr
    except NameError:
        tochar = chr

    dicta = {}
    dicta[lname+'_LOWER'] = {}
    dicta[lname+'_UPPER'] = {}
    dicta[lname+'_OTHER'] = {}

    categorysuffix = (('Lu', '_UPPER'), ('Ll', '_LOWER'), ('Lo', '_OTHER'))

    for uc in blocksdict.get(tuple(block), {}):
        name = unicodedat[uc][0]
        case = unicodedat[uc][2]
        if not name:
            continue
        for category, suffix in categorysuffix:
            if category in case:
                chardict = dicta[lname + suffix]
                # Convert the hex string directly; no eval needed.
                chardict[len(chardict)+1] = tochar(int(uc, 16)).encode('utf-8')
                break

    return dicta

def createkeyboardrows(dicta, lname):
    """Regroup the per-case character dictionaries into keyboard rows.

    dicta -- dictionary holding ``lname+'_LOWER'``, ``lname+'_UPPER'``
             and ``lname+'_OTHER'`` character dictionaries.
    lname -- prefix used in the keys of ``dicta``.

    Returns ``{'LOWER': rows, 'UPPER': rows}``.  ``rows`` maps 1, 2, ...
    to row dictionaries, and each row maps 1, 2, ... to a character,
    holding at most 11 characters.  The LOWER rows are built from the
    lower-case characters followed by the OTHER characters; the UPPER
    rows from the upper-case characters followed by the OTHER
    characters.  Each source always contributes a final row, even when
    that row is empty.
    """
    kblangpack = {'LOWER': {}, 'UPPER': {}}

    def appendrows(sourcename, targetname):
        # Chunk one source dictionary into rows and append them to the
        # rows already collected under targetname.
        source = dicta[lname + '_' + sourcename]
        rows = kblangpack[targetname]
        row = {}
        for charkey in source:
            if len(row) >= 11:
                rows[len(rows) + 1] = row
                row = {}
            row[len(row) + 1] = source[charkey]
        # The last (possibly empty) row is always appended.
        rows[len(rows) + 1] = row

    for sourcename, targetname in (('LOWER', 'LOWER'),
                                   ('UPPER', 'UPPER'),
                                   ('OTHER', 'LOWER'),
                                   ('OTHER', 'UPPER')):
        appendrows(sourcename, targetname)

    return kblangpack



def blockimporter(blockfilepath = ''):
    """Read a blocks table with lines of the form ``START..END; Name``.

    START and END are hexadecimal code points.  Lines starting with
    '#', lines that do not split into exactly two ';'-separated fields,
    and lines whose first field is not of the form ``START..END`` are
    skipped.

    Returns ``(returndict, quickfinddict)``:
        returndict[NAME] = [START, END]          (hex strings as read)
        quickfinddict[tuple(range(start, end + 1))] = ([START, END], NAME)
    where NAME is the block name upper-cased with surrounding whitespace
    removed and internal whitespace runs collapsed to one space.
    """
    returndict = {}
    quickfinddict = {}
    with open(blockfilepath) as blockfile:
        for line in blockfile:
            if line.lstrip().startswith('#'):
                # A comment such as "# Start Code..End Code; Block Name"
                # would otherwise be mistaken for a data line.
                continue
            linesplit = line.split(';')
            if len(linesplit) != 2:
                continue
            # Same normalisation as startendwhitespaceremove.
            block = ' '.join(linesplit[0].split())
            name = ' '.join(linesplit[1].split()).upper()
            blocklist = block.split('..')
            if len(blocklist) != 2:
                continue
            block1 = int(blocklist[0], 16)
            block2 = int(blocklist[1], 16)
            returndict[name] = blocklist
            quickfinddict[tuple(range(block1, block2+1))] = (blocklist, name)
    return (returndict,quickfinddict)

def importer(blockfilepath, unicodedatfilepath):
    """Import every block of the blocks table into the 'Language' entry
    of the Keycachedata store.

    blockfilepath      -- path handed to blockimporter.
    unicodedatfilepath -- path handed to preappendunicodedata.

    For each block name the keyboard rows are built and stored under
    that name in the dictionary kept at the 'Language' key.  The
    dictionary is re-read and re-saved once per block.
    """
    blockdict, quickfinddict = blockimporter(blockfilepath)
    unicodedat, blocksdict = preappendunicodedata(unicodedatfilepath,
                                                  quickfinddict)
    for blockname, block in blockdict.items():
        print('processing: ', blockname)
        casedicts = readunicodefile(blockname, unicodedat, blocksdict,
                                    block)
        keyboardrows = createkeyboardrows(casedicts, blockname)

        # Fetch the current language pack, add this block, write it back.
        keydatamanager = Keycachedata()
        langpackdict = keydatamanager.retrievekey('Language')
        langpackdict[blockname] = keyboardrows
        keydatamanager.savekey('Language', langpackdict)




# Run the import only when this file is executed as a script, so that
# importing it (e.g. "from filename import Keycachedata") does not start
# an import run.
# Usage: python <this file> <blocks file path> <unicode data file path>
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 3:
        sys.exit('usage: ' + sys.argv[0] +
                 ' <blocks file path> <unicode data file path>')
    importer(sys.argv[1], sys.argv[2])
 
 
This program builds a character set library from existing Unicode table data, storing the characters in UTF-8 form. I'm not sure whether such a library already exists in Ubuntu, so I may have reinvented the wheel here. It stores the characters of case type forms 'Lu', 'Ll', and 'Lo' (upper, lower, and other); it does not include punctuation, control characters, or a range of other types. You could manually tweak the filters in the source above to include these if you like.

Also, I haven't written the parser to skip comment lines in the Unicode tables, so simply remove these before running; you may also need to remove the last line of the blocks table.

Also the shelve object file is stored in the same directory that the program is run on.  You will need to create the key 'Language' prior to running in order to save table data to this shelve file.  You can do this using the usual python console commands:

>>import shelve
>>a = shelve.open(stringpathtoshelvefilenamehere, 'n')
>>a['Language'] = {}

or use the Keycachedata class above for key creation...using filename for module importation this can be done as follows:

>>from filename import Keycachedata
>>a = Keycachedata()
>>a.addkey('Language')

Then run script above.

To access this data, you can do so using Keycachedata module as follows:

>>a = Keycachedata()
>>Languagedict = a.retrievekey('Language')

Then Languagedict will contain all of the imported language table information. You simply need to provide the language key, which should be in all caps, or you can list the keys at the console with:

>>list(Languagedict.keys())

No comments:

Post a Comment

Oblivion

 Between the fascination of an upcoming pandemic ridden college football season, Taylor Swift, and Kim Kardashian, wildfires, crazier weathe...