# Regexes for the textual/numeric date shapes the date helpers look for.
# Order matters: parseDate returns the first parseable hit.
_DATE_PATTERNS = (
    r"(\w+\s\d+\,\s\d+)",
    r"(\w+\s\d+\,\d+)",
    r"(\w+\s\d+\s\d+)",
    r"(\d+\s\w+\s\d+)",
    r"(\d+\/\d+\/\d+)",
    r"(\d+\.\d+\.\d+)",
    r"(\d+\-\w+\-\d+)",
    r"(\d+'\w+\s\d+)",
)

# Same as _DATE_PATTERNS except that the apostrophe form may omit the
# trailing number.
_DATETIME_PATTERNS = _DATE_PATTERNS[:-1] + (r"(\d+'\w+\s\d+|\d+'\w+)",)

# Captures the rest of the line that follows a label.
_CATCH_ALL_PATTERNS = (r"(.*)",)

_PAN_PATTERNS = (
    r"[A-Za-z]{5}\d{4}[A-Za-z]{1}",
    r"[A-Za-z]{4}\d{4}[A-Za-z]{2}",
)

_AADHAR_PATTERNS = (r"(?<!\d)\d{4}\s\d{4}\s\d{4}(?!\d)",)

_GSTIN_LABELS = ("GSTIN", "STIN", "gstin", "GST")

# The first label found in the text wins, so order is significant.
# NOTE(review): 'Total' precedes 'Total Amount'/'TotalAmount', so the longer
# labels can never be selected; kept as-is to preserve existing results.
_AMOUNT_LABELS = (
    "TOTAL", "Total", "total", "TotalAmount", "Total Amount", "totalamount",
)

_CURRENCY_LABELS = (
    "TOTAL", "Total", "total", "TotalAmount", "Total Amount", "totalamount",
    "total amount", "totalAmount",
)

# Checked in this order; the first marker present in any candidate wins.
_CURRENCY_MARKERS = ("$", "INR", "1NR", "EUR", "AUD", "CAD", "SGD", "GBP")

# The first label found in the text wins, so order is significant.
# NOTE(review): 'Invoice No' precedes 'Invoice Number' and 'Invoice No.', so
# those longer labels can never be selected; kept to preserve results.
_INVOICE_LABELS = (
    "Ilnvoice No", "Invoice Number:", "Invoice No", "INVOICE NO.",
    "InvoiceNumber", "Invoice Number", "InvNo", "Inv. No.", "InvoiceNo",
    "Invoice No.", "lnvolceNo", "lnvolce No.", "lnvolceNumber",
    "lnvolce Number", "noviceNo",
)


def _is_parseable_date(candidate):
    """Return True when ``dparser.parse`` accepts *candidate*."""
    try:
        dparser.parse(candidate)
    except Exception:
        # ``dparser`` is provided elsewhere in the file; any failure it
        # reports simply means "this text is not a date", so the broad
        # catch is a deliberate best-effort.
        return False
    return True


def _first_label_in(text, labels):
    """Return the first entry of *labels* that occurs in *text*, or None."""
    for label in labels:
        if label in text:
            return label
    return None


def _token_after_label(text, label):
    """Return the first token that follows *label* in *text*.

    *label* must occur in *text*. Whatever is glued to the label up to the
    first space (":", ".", ...) is dropped, then the next run of characters
    up to a space or newline is returned.
    """
    remainder = text.split(label, 1)[1]
    # One or two pieces; the last one is the text after the first space
    # (or the whole remainder when there is no space at all).
    remainder = remainder.split(' ', 1)[-1]
    return remainder.split(' ')[0].split('\n')[0]


def _text_after_labels(text, labels, patterns):
    """Collect group 1 of ``label + pattern`` for every label/pattern pair.

    Pairs that do not match, do not compile, or have no capture group are
    skipped. A non-string *text* yields an empty list.
    """
    captured = []
    for label in labels:
        for pattern in patterns:
            try:
                match = re.search(label + pattern, text)
            except (re.error, TypeError):
                continue
            if match is not None and match.re.groups:
                captured.append(match.group(1))
    return captured


def _find_all(text, patterns):
    """Return the matches of every pattern in *patterns*, in pattern order.

    A bad pattern or non-string *text* is reported with ``print`` and the
    matches gathered so far are returned.
    """
    matches = []
    try:
        for pattern in patterns:
            matches.extend(re.findall(pattern, text))
    except (re.error, TypeError) as e:
        print(e)
    return matches


def parseDate(stringData, dataRgex=_DATE_PATTERNS):
    """Return the first date-like substring of *stringData*.

    Patterns are tried in order (case-insensitively); the first match that
    ``dparser.parse`` accepts is returned as a string.

    :param stringData: text to scan.
    :param dataRgex: iterable of regex strings to try.
    :returns: the matching substring, or an empty list when nothing
        parseable is found (or *stringData* is empty/None).
    """
    if not stringData:
        return []
    for pattern in dataRgex:
        for candidate in re.findall(pattern, stringData, re.IGNORECASE):
            if _is_parseable_date(candidate):
                return candidate
    return []


# Alternation of clock-time shapes ("hh:mm:ss AM", "hh:mm AM", ...).
# Not referenced by the functions below; presumably meant to be passed to
# parseDateTime by callers -- TODO confirm.
dataTime = r'(\d+\:\d+\:\d+ \w+)|(\d+\:\d+\:\d+\w+)|(\d+\:\d+ \w+)|(\d+\:\w+)'


def parseDateTime(stringData, dataRgex=_DATETIME_PATTERNS):
    """Return every date/time-like substring of *stringData*.

    For single-group patterns ``re.findall`` yields strings; for multi-group
    patterns (such as ``dataTime``) it yields tuples, one entry per group.
    Both shapes are handled: every non-empty captured string accepted by
    ``dparser.parse`` is collected. (Previously string matches were iterated
    character by character, so single characters were tested instead of the
    match itself.)

    :param stringData: text to scan.
    :param dataRgex: iterable of regex strings to try.
    :returns: list of parseable substrings, in pattern order.
    """
    if not stringData:
        return []
    found = []
    for pattern in dataRgex:
        for match in re.findall(pattern, stringData, re.IGNORECASE):
            groups = match if isinstance(match, tuple) else (match,)
            for candidate in groups:
                # Groups that did not participate come back as ''.
                if candidate and _is_parseable_date(candidate):
                    found.append(candidate)
    return found


def parseGSTIN(stringData, dataRgex=_CATCH_ALL_PATTERNS):
    """Return the text following each GSTIN-style label in *stringData*.

    Each of 'GSTIN', 'STIN', 'gstin', 'GST' is combined with every pattern
    in *dataRgex*; group 1 of each successful search is appended, so
    overlapping labels produce several (possibly repeated) entries.

    :returns: list of captured strings; empty when nothing matches.
    """
    return _text_after_labels(stringData, _GSTIN_LABELS, dataRgex)


def parseAmount(stringData, dataRgex=_CATCH_ALL_PATTERNS):
    """Return the token following every occurrence of the "total" label.

    The first label from the known spellings that occurs in *stringData* is
    selected; for each of its occurrences the next token is extracted.

    :param stringData: text to scan.
    :param dataRgex: unused; kept for signature compatibility.
    :returns: list of tokens, one per occurrence; empty when no label is
        present (previously this raised ``ValueError``).
    """
    if not stringData:
        return []
    label = _first_label_in(stringData, _AMOUNT_LABELS)
    if label is None:
        return []
    return [
        _token_after_label(stringData[hit.start():], label)
        for hit in re.finditer(re.escape(label), stringData)
    ]


def parseInvoiceNumber(stringData, dataRgex=_CATCH_ALL_PATTERNS):
    """Return the token following the invoice-number label.

    The label list includes common OCR misreads ('lnvolce No.', ...). The
    first label that occurs in *stringData* is used.

    :param stringData: text to scan.
    :param dataRgex: unused; kept for signature compatibility.
    :returns: the token as a string, or '' when no label is present
        (previously this raised ``ValueError``).
    """
    if not stringData:
        return ""
    label = _first_label_in(stringData, _INVOICE_LABELS)
    if label is None:
        return ""
    return _token_after_label(stringData, label)


def parseCurrency(stringData, dataRgex=_CATCH_ALL_PATTERNS):
    """Return the currency marker found next to a "total" label.

    The text following each known "total" spelling is collected and scanned
    for the markers '$', 'INR', '1NR', 'EUR', 'AUD', 'CAD', 'SGD', 'GBP' in
    that order.

    :returns: a one-element list holding the first marker found, or an
        empty list when there is none. (The no-marker case used to return
        '' or [] depending on the path; it is now always a list.)
    """
    candidates = _text_after_labels(stringData, _CURRENCY_LABELS, dataRgex)
    for marker in _CURRENCY_MARKERS:
        for candidate in candidates:
            if candidate is not None and marker in candidate:
                return [marker]
    return []


def getPanNumber(stringData, dataRgex=_PAN_PATTERNS):
    """Return every PAN-shaped substring of *stringData*.

    Matches from all patterns are accumulated; previously only the last
    pattern's matches survived, so a standard 5-letter/4-digit/1-letter PAN
    was never returned.

    :returns: list of matches; empty on no match or on bad input.
    """
    return _find_all(stringData, dataRgex)


def getAadharNumber(stringData, dataRgex=_AADHAR_PATTERNS):
    """Return every Aadhaar-shaped substring ("dddd dddd dddd").

    All patterns in *dataRgex* are applied (the default has one).

    :returns: list of matches; empty on no match or on bad input
        (previously bad input raised ``UnboundLocalError``).
    """
    return _find_all(stringData, dataRgex)
Friday, 30 March 2018
python parser
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment