Friday 30 March 2018

Python parser

def parseDate(stringData, dataRgex=[
        r'(\w+\s\d+\,\s\d+)',
        r'(\w+\s\d+\,\d+)',
        r'(\w+\s\d+\s\d+)',
        r'(\d+\s\w+\s\d+)',
        r'(\d+\/\d+\/\d+)',
        r'(\d+\.\d+\.\d+)',
        r'(\d+\-\w+\-\d+)',
        r"(\d+'\w+\s\d+)",
]):
    """Return the first date-like substring of ``stringData``.

    Each pattern in ``dataRgex`` is tried in order (case-insensitively).
    Every regex hit is handed to ``dparser.parse`` as a validity check and
    the first hit that parses is returned unchanged (the matched text, not
    a parsed date object).

    ``dparser`` must already be available at module level.
    NOTE(review): presumably ``dateutil.parser`` -- confirm against the
    file's imports.

    :param stringData: text to scan; ``None`` or ``""`` yields ``[]``.
    :param dataRgex: regex patterns, highest priority first.
    :return: the matched substring, or an empty list when nothing matches
        (the mixed return type is kept for existing callers).
    """
    if not stringData:
        return []

    for pattern in dataRgex:
        for candidate in re.findall(pattern, stringData, re.IGNORECASE):
            try:
                dparser.parse(candidate)
            except Exception:
                # Deliberate best-effort: a regex hit that the date parser
                # rejects is simply not a date, so try the next candidate.
                continue
            return candidate
    return []



# Time-of-day pattern with four alternative groups; intended to be passed to
# parseDateTime wrapped in a list, e.g. parseDateTime(text, [dataTime]).
dataTime = r'(\d+\:\d+\:\d+ \w+)|(\d+\:\d+\:\d+\w+)|(\d+\:\d+ \w+)|(\d+\:\w+)'


def parseDateTime(stringData, dataRgex=[
        r'(\w+\s\d+\,\s\d+)',
        r'(\w+\s\d+\,\d+)',
        r'(\w+\s\d+\s\d+)',
        r'(\d+\s\w+\s\d+)',
        r'(\d+\/\d+\/\d+)',
        r'(\d+\.\d+\.\d+)',
        r'(\d+\-\w+\-\d+)',
        r"(\d+'\w+\s\d+|\d+'\w+)",
]):
    """Collect every date/time-like substring of ``stringData``.

    Each pattern in ``dataRgex`` is applied case-insensitively and every
    captured group is validated with ``dparser.parse``; groups that parse
    are appended to the result in the order they are found.

    ``dparser`` must already be available at module level.
    NOTE(review): presumably ``dateutil.parser`` -- confirm against the
    file's imports.

    :param stringData: text to scan; ``None`` or ``""`` yields ``[]``.
    :param dataRgex: list of regex patterns.
    :return: list of matched substrings that the date parser accepted.
    """
    accepted = []
    if not stringData:
        return accepted

    for pattern in dataRgex:
        for hit in re.findall(pattern, stringData, re.IGNORECASE):
            # re.findall yields a plain string for single-group patterns and
            # a tuple for multi-group ones.  Normalise to a tuple so that a
            # string hit is validated as a whole instead of being iterated
            # character by character.
            groups = hit if isinstance(hit, tuple) else (hit,)
            for candidate in groups:
                try:
                    dparser.parse(candidate)
                except Exception:
                    # Best-effort filter: unparsable (or empty) groups are
                    # just skipped.
                    continue
                accepted.append(candidate)
    return accepted



def parseGSTIN(stringData, dataRgex=['(.*)']):
    """Return the text that follows each GSTIN-style label in ``stringData``.

    For every label in ``GSTIN``, ``STIN``, ``gstin``, ``GST`` (in that
    order) and every pattern in ``dataRgex``, the regex ``label + pattern``
    is searched and its first capture group is collected.  Because the
    labels overlap, a single ``GSTIN`` line contributes several entries.

    :param stringData: text to scan; ``None`` or ``""`` yields ``[]``.
    :param dataRgex: regex suffixes appended to each label; each should
        contain at least one capture group (patterns without one are
        skipped).
    :return: list of captured strings, one per successful label/pattern
        combination.
    """
    captured = []
    if not stringData:
        return captured

    for label in ('GSTIN', 'STIN', 'gstin', 'GST'):
        for pattern in dataRgex:
            match = re.search(label + pattern, stringData)
            # Skip labels that are absent and patterns with no group to read.
            if match is not None and match.re.groups:
                captured.append(match.group(1))
    return captured



def parseAmount(stringData, dataRgex=['(.*)']):
    """Return the token found after every occurrence of a "total" label.

    The first label from the keyword list that occurs in ``stringData`` is
    selected, and for each occurrence of that label one token is extracted
    from the text that follows it:

    * if the following text contains a space, everything up to and
      including the first space is dropped (this skips a leading space or
      a separator such as ``":"``);
    * the remainder is cut at the next space or newline.

    :param stringData: text to scan; ``None`` or ``""`` yields ``[]``.
    :param dataRgex: unused; kept so the signature matches the sibling
        parsers.
    :return: list of extracted tokens, one per label occurrence; empty when
        no label is present.
    """
    if not stringData:
        return []

    # Order matters: the first label present in the text wins.
    labels = ('TOTAL', 'Total', 'total',
              'TotalAmount', 'Total Amount', 'totalamount')
    label = next((name for name in labels if name in stringData), None)
    if label is None:
        # Previously fell through with an empty label and crashed on
        # str.split("").
        return []

    amounts = []
    for occurrence in re.finditer(re.escape(label), stringData):
        tail = stringData[occurrence.end():]
        pieces = tail.split(' ', 1)
        remainder = pieces[1] if len(pieces) > 1 else pieces[0]
        amounts.append(remainder.split(' ')[0].split('\n')[0])
    return amounts



def parseInvoiceNumber(stringData, dataRgex=['(.*)']):
    """Return the token that follows the first invoice-number label.

    The label list (which includes common OCR misreadings such as
    ``lnvolce``) is checked in order and the first label contained in
    ``stringData`` is used.  From the text after the label's first
    occurrence:

    * if it contains a space, everything up to and including the first
      space is dropped (this skips a leading space or a separator such as
      ``":"``);
    * the remainder is cut at the next space or newline.

    :param stringData: text to scan; ``None`` or ``""`` yields ``""``.
    :param dataRgex: unused; kept so the signature matches the sibling
        parsers.
    :return: the extracted token, or ``""`` when no label is present.
    """
    if not stringData:
        return ""

    # Order matters: the first label present in the text wins.
    labels = ('Ilnvoice No', 'Invoice Number:', 'Invoice No', 'INVOICE NO.',
              'InvoiceNumber', 'Invoice Number', 'InvNo', 'Inv. No.',
              'InvoiceNo', 'Invoice No.', 'Invoice Number', 'lnvolceNo',
              'lnvolce No.', 'lnvolceNumber', 'lnvolce Number', 'noviceNo')
    label = next((name for name in labels if name in stringData), None)
    if label is None:
        # Previously fell through with an empty label and crashed on
        # str.split("").
        return ""

    tail = stringData.split(label, 1)[1]
    pieces = tail.split(' ', 1)
    remainder = pieces[1] if len(pieces) > 1 else pieces[0]
    return remainder.split(' ')[0].split('\n')[0]


def parseCurrency(stringData, dataRgex=['(.*)']):
    """Detect the currency marker that appears after a "total" label.

    For every "total" label and every pattern in ``dataRgex`` the regex
    ``label + pattern`` is searched in ``stringData`` and its first capture
    group is collected.  The collected fragments are then checked against
    the known markers in priority order (``$``, ``INR``, ``1NR``, ``EUR``,
    ``AUD``, ``CAD``, ``SGD``, ``GBP``); the first marker contained in any
    fragment wins.

    :param stringData: text to scan.
    :param dataRgex: regex suffixes appended to each label; each should
        contain at least one capture group (patterns without one are
        skipped).
    :return: ``[marker]`` when a marker is found; ``""`` when a label
        matched but no marker was present; ``[]`` when no label matched
        (including ``None``/empty input).  The two distinct "not found"
        values are kept as-is for existing callers; both are falsy.
    """
    if not stringData:
        return []

    labels = ('TOTAL', 'Total', 'total', 'TotalAmount', 'Total Amount',
              'totalamount', 'total amount', 'totalAmount')
    fragments = []
    for label in labels:
        for pattern in dataRgex:
            match = re.search(label + pattern, stringData)
            # Skip labels that are absent and patterns with no group to read.
            if match is not None and match.re.groups:
                fragments.append(match.group(1))

    if not fragments:
        return []

    for marker in ('$', 'INR', '1NR', 'EUR', 'AUD', 'CAD', 'SGD', 'GBP'):
        if any(marker in str(fragment) for fragment in fragments):
            return [marker]
    return ""


def getPanNumber(stringData, dataRgex=[r'[A-Za-z]{5}\d{4}[A-Za-z]{1}', r'[A-Za-z]{4}\d{4}[A-Za-z]{2}']):
    """Return every PAN-like token found in ``stringData``.

    All patterns in ``dataRgex`` are applied and their matches are
    concatenated in pattern order.  (Earlier revisions overwrote the result
    on each iteration, so only the last pattern's matches were returned.)

    :param stringData: text to scan.
    :param dataRgex: regex patterns to apply.
    :return: list of matches; empty when nothing matches or when a pattern
        cannot be applied (the error is printed and that pattern skipped).
    """
    matches = []
    for pattern in dataRgex:
        try:
            matches.extend(re.findall(pattern, stringData))
        except (TypeError, re.error) as e:
            # Non-text input or an invalid pattern: report and keep going so
            # the remaining patterns still get a chance.
            print(e)
    return matches

def getAadharNumber(stringData, dataRgex=[r'(?<!\d)\d{4}\s\d{4}\s\d{4}(?!\d)']):
    """Return every Aadhaar-like number found in ``stringData``.

    The default pattern matches three groups of four digits separated by
    single whitespace characters and not adjacent to further digits.  All
    patterns in ``dataRgex`` are applied and their matches concatenated in
    pattern order (previously only the first pattern was used).

    :param stringData: text to scan.
    :param dataRgex: regex patterns to apply.
    :return: list of matches; empty when nothing matches or when a pattern
        cannot be applied (the error is printed and that pattern skipped).
    """
    matches = []
    for pattern in dataRgex:
        try:
            matches.extend(re.findall(pattern, stringData))
        except (TypeError, re.error) as e:
            # Non-text input or an invalid pattern: report it instead of
            # failing later on an unbound result variable.
            print(e)
    return matches

No comments:

Post a Comment