Skip to content

Commit

Permalink
Merge pull request #7 from hasadna/with-calamine
Browse files Browse the repository at this point in the history
With calamine
  • Loading branch information
Guy-Galil authored Dec 26, 2024
2 parents e975003 + 1fa5821 commit 0c7622d
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 58 deletions.
10 changes: 5 additions & 5 deletions djang/importer/field_mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@
{
"class_name": "importer.models.AssetDetails",
"field_name": "proceeds",
"column_title": ["תשואה לפדיון","תשואה לפידיון"],
"column_title": ["תשואה לפדיון","תשואה לפידיון","תשואה לפידיון לפי שווי הוגן"],
"type": "extracted",
"ref_name":"details"
},
Expand Down Expand Up @@ -238,7 +238,7 @@
{
"class_name": "importer.models.AssetDetails",
"field_name": "percent_of_asset_channel",
"column_title": ["שעור מנכסי אפיק השקעה","שיעור מנכסי אפיק ההשקעה","שעור מנכסי אפיק ההשקעה"],
"column_title": ["שעור מנכסי אפיק השקעה","שיעור מנכסי אפיק ההשקעה","שעור מנכסי אפיק ההשקעה","שיעור מנכסי אפיק ה השקעה"],
"type": "extracted",
"ref_name":"details"
},
Expand Down Expand Up @@ -315,7 +315,7 @@
{
"class_name": "importer.models.AssetDetails",
"field_name": "average_interest_rate",
"column_title": ["שיעור ריבית ממוצע"],
"column_title": ["שיעור ריבית ממוצע","שעור ריבית ממוצע"],
"type": "extracted",
"ref_name":"details"
},
Expand All @@ -329,7 +329,7 @@
{
"class_name": "importer.models.AssetDetails",
"field_name": "commitment",
"column_title": ["סכום ההתחייבות"],
"column_title": ["סכום ההתחייבות","יתרת התחייבות באלפי ₪"],
"type": "extracted",
"ref_name":"details"
},
Expand All @@ -343,7 +343,7 @@
{
"class_name": "importer.models.AssetDetails",
"field_name": "coordinated_cost",
"column_title": ["עלות מתואמת","עלות מותאמת"],
"column_title": ["עלות מתואמת","עלות מותאמת","עלות"],
"type": "extracted",
"ref_name":"details"
},
Expand Down
14 changes: 12 additions & 2 deletions djang/importer/management/commands/import_from_folder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from django.core.management.base import BaseCommand, CommandError
from importer import models# TODO remove
from importer.services import xsls_ingester
from importer.services import ingester_with_calamine
import os
import io
import datetime
Expand All @@ -24,6 +25,7 @@ def handle(self, *args, **options):
m=options["mode"]

count = failed = 0
pyxl = calamine = 0
self.stdout.write(str(datetime.datetime.now())+" About to import files from %s" %options["path"])
files = os.listdir(p)
files = [os.path.join(p, f) for f in files if not f.startswith(".")]
Expand All @@ -33,12 +35,20 @@ def handle(self, *args, **options):
ingester = xsls_ingester.xls_ingester()
ingester.force = options["force_override"] is not None
self.stdout.write("ingest %s" %filename )
if ingester.ingest(filename, xls):
status=ingester.ingest(filename, xls)
if not status:
ingester = ingester_with_calamine.Ingester_whith_calamine()
self.stdout.write("ingest with calamine %s" %filename )
status=ingester.ingest(filename, xls)
calamine +=1
if status:
count +=1
self.stdout.write('count={0}'.format(count))
else:
failed +=1
#xsls_ingester.xls_import(p, None)
self.stdout.write(
self.style.SUCCESS(str(datetime.datetime.now())+'Successfully imported {0} files, {1} files failed'.format(count, failed))
self.style.SUCCESS(str(datetime.datetime.now())+
'Successfully imported {0} files, {1} files failed, {2}'.format(count, failed, calamine))
)

1 change: 1 addition & 0 deletions djang/importer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,4 @@ class UnmappedFields(models.Model):
tab_name = models.CharField(max_length=255)
field = models.CharField(max_length=255)
date = models.DateTimeField(auto_now=True)
comment = models.CharField(max_length=255,null=True)
40 changes: 34 additions & 6 deletions djang/importer/services/ingester_with_calamine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@

from python_calamine import CalamineWorkbook
import importer
from importer import models
from importer.services import xsls_ingester
import traceback

Expand All @@ -13,12 +15,39 @@ def get_sheetnames(self, wb):
return wb.sheet_names

def getSheet(self, wb, sn):
    """Resolve a sheet from a single name or from a list of candidate names.

    Args:
        wb: workbook object exposing ``get_sheet_by_name(name)``.
        sn: either one sheet name (``str``) or an iterable of candidate
            names that are tried in order.

    Returns:
        A ``(worksheet, sheet_name)`` tuple. When ``sn`` is an iterable and
        no candidate yields a sheet, ``(None, None)`` is returned so that
        callers which unpack two values and then test the worksheet for
        ``None`` keep working instead of failing on the unpacking itself.
    """
    if isinstance(sn, str):
        return wb.get_sheet_by_name(sn), sn
    for candidate in sn:
        ws = wb.get_sheet_by_name(candidate)
        # NOTE(review): this relies on get_sheet_by_name returning None for a
        # missing sheet; confirm it does not raise instead.
        if ws is not None:
            return ws, candidate
    # Keep the two-element shape on the not-found path (was a bare None).
    return None, None

def find_title_row(self, ws, field_name):
    """Locate the first row that contains one of the expected column titles.

    Args:
        ws: worksheet object exposing ``iter_rows()``; each row is iterated
            as a sequence of raw cell values.
        field_name: collection of accepted titles; a cell matches when
            ``cell in field_name`` is true.

    Returns:
        The 0-based position of the first matching row within
        ``ws.iter_rows()``, or ``None`` when no row matches.
    """
    # The sheet is scanned lazily; the previous version also materialised the
    # whole sheet with to_python() into a local that was never read.
    # NOTE(review): get_row()/get_rows() index into
    # to_python(skip_empty_area=True); confirm iter_rows() uses the same row
    # origin so the returned index lines up.
    for row_index, row in enumerate(ws.iter_rows()):
        for cell in row:
            if cell is not None and cell in field_name:
                return row_index
    return None

def get_row(self, workbook, sheet, row):
    """Return a single row of the named sheet.

    The sheet is looked up with ``get_sheet_by_name``, converted with
    ``to_python(skip_empty_area=True)``, and indexed with ``row``.
    """
    worksheet = workbook.get_sheet_by_name(sheet)
    grid = worksheet.to_python(skip_empty_area=True)
    return grid[row]

def get_rows(self, workbook, sheet, first_row):
    """Return every row of the named sheet from ``first_row`` onwards.

    The sheet is looked up with ``get_sheet_by_name``, converted with
    ``to_python(skip_empty_area=True)``, and sliced as ``[first_row:]``.
    """
    worksheet = workbook.get_sheet_by_name(sheet)
    grid = worksheet.to_python(skip_empty_area=True)
    return grid[first_row:]

def get_value(self, cel):
    """Return the value of a cell.

    In this ingester the cell is handed back unchanged.
    """
    value = cel
    return value

def get_cell_index(self, **kwargs):
    """Return the 1-based position of ``cell`` inside ``title_row``.

    Keyword Args:
        title_row: sequence supporting ``.index()``.
        cell: the value to locate; the first occurrence is used.
    """
    header = kwargs.get("title_row")
    target = kwargs.get("cell")
    return header.index(target) + 1

def parse_first_tab(self, wb, sn, tab):
# from first tab get report date, company, track name and track code
# optinally get report summary as well
Expand All @@ -27,11 +56,10 @@ def parse_first_tab(self, wb, sn, tab):
if field["type"] == 'generated':
continue
else:
worksheet = self.getSheet(wb, sn)
worksheet, sn= self.getSheet(wb, sn)
if worksheet is not None:
workarray = worksheet.to_python(skip_empty_area=False)

for row in workarray[0:3]:
for row in workarray[0:4]:
i = -1
for cell in row:
i += 1
Expand Down Expand Up @@ -59,7 +87,7 @@ def parse_first_tab(self, wb, sn, tab):
break
self.save_first_tab()

return
return worksheet.name

def ingest(self, filename, file_stream):
try:
Expand All @@ -69,7 +97,7 @@ def ingest(self, filename, file_stream):
return True
except ValueError:

#traceback.print_exc()
traceback.print_exc()
print("report already exists")
return False

Expand Down
122 changes: 77 additions & 45 deletions djang/importer/services/xsls_ingester.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,19 @@ def find_sn(self, wb):
if sn != "any":
sn2 = self.parse_first_tab(wb, sn, tab)
sheet_name_array.remove(sn2)
ws = wb[sn2]
fields = tab["fields"]
for field in fields:
if field["type"] == "generated":
continue
#ws = wb[sn2]
#fields = tab["fields"]
#for field in fields:
# if field["type"] == "generated":
# continue
# print(field["column_title"])
# print(tab)
return sheet_name_array, tab

def find_title_row(self, ws, field_name):
# print(field_name)
#(str(ws))
for row in ws.iter_rows():
#print("+++++++++++++++++++++++++ after iter")
for cell1 in row:
if cell1.value is not None:
if cell1.value in field_name:
Expand Down Expand Up @@ -85,12 +86,26 @@ def put_header_fields(self, reference_objects):
self.reference_objects, field, val)

def getSheet(self, wb, sn):
    """Resolve a sheet from a single name or from a list of candidate names.

    Args:
        wb: workbook supporting ``name in wb`` and ``wb[name]``.
        sn: either one sheet name (``str``) or an iterable of candidate
            names that are tried in order.

    Returns:
        A ``(worksheet, sheet_name)`` tuple for the first candidate present
        in the workbook. When ``sn`` is an iterable and none of its names is
        present, ``(None, None)`` is returned; previously this path failed
        with ``UnboundLocalError`` because the matched name was never bound.
    """
    if isinstance(sn, str):
        return wb[sn], sn
    for candidate in sn:
        if candidate in wb:
            return wb[candidate], candidate
    # No candidate matched: keep the two-element shape so callers can unpack.
    return None, None

def get_row(self, workbook, sheet, row):
    """Return a single row of ``workbook[sheet]``.

    ``iter_rows`` is called with ``min_row`` and ``max_row`` both set to
    ``row`` and ``values_only=False``; the first row it yields is returned.
    """
    worksheet = workbook[sheet]
    single_row = worksheet.iter_rows(min_row=row, max_row=row, values_only=False)
    return next(single_row)

def get_rows(self, workbook, sheet, first_row):
    """Return the rows of ``workbook[sheet]`` starting at ``first_row``.

    The result is whatever ``iter_rows(min_row=first_row,
    values_only=False)`` returns; it is not materialised here.
    """
    worksheet = workbook[sheet]
    remaining = worksheet.iter_rows(min_row=first_row, values_only=False)
    return remaining

def get_value(self, cell):
    """Return the ``value`` attribute of a cell object."""
    value = cell.value
    return value

def get_cell_index(self, **kwargs):
    """Return the ``column`` attribute of the cell passed as ``cell``.

    Keyword Args:
        cell: cell object exposing ``.column``. Any other keyword
            arguments are accepted and ignored.
    """
    cell = kwargs.get("cell")
    return cell.column

def parse_first_tab(self, wb, sn, tab):
# from first tab get report date, company, track name and track code
# optinally get report summary as well
Expand Down Expand Up @@ -209,49 +224,61 @@ def parse_spreadsheet(self, wb):
column_idxs = list()
column_list = []
title_row = 0
found = 0
# find title row in tab
for field in tab["fields"]:
if field["type"] == 'generated':
continue
if field["type"] == 'extracted':
if title_row == 0:
tr = self.find_title_row(wb[sh], field["column_title"])
ws, sn2=self.getSheet(wb,sh)
tr = self.find_title_row(ws, field["column_title"])
if tr is not None:
title_row = tr
break
# find row of values
for row in wb[sh].iter_rows(min_row=title_row, max_row=title_row, values_only=False):
for cell in row:
found = 0
if cell.value is not None:
if str(cell.value).startswith("{PL}PickLst"):
break
# in some of the reports there are multiple * characters of column titles as pointers to comments
stripped = str(cell.value).replace('*', '').strip()
for field in tab["fields"]:
if field["type"] == 'reference':
found += 1
column_idxs.append(found)
column_list.append(field)
if field["field_name"] == "category":
found += 1
column_idxs.append(found)
column_list.append(field)
if stripped in field["column_title"]:
found += 1
column_idxs.append(cell.column)
column_list.append(field)
if found == 3:
break
if found < 3:
# field not found in mapping - log
uf = importer.models.UnmappedFields()
uf.file_name = self.file
uf.tab_name = sh
uf.field = str(cell.value)
uf.save()
#
# create an array of fields by finding the field title in title row
# create an array of indexes of the fields in the worksheet
title_row = self.get_row(wb, sh, tr)
for cell in title_row:
if cell == '':
break
cell_value = self.get_value(cell)
if cell_value is None:
continue
cell_index = self.get_cell_index(cell=cell, title_row=title_row) #title_row.index(cell)
if cell_value is not None:
if str(cell_value).startswith("{PL}PickLst"):
break
# in some of the reports there are multiple * characters of column titles as pointers to comments
stripped = str(cell_value).replace('*', '').strip()
for field in tab["fields"]:
found1 = False
if found < 2:
if field["type"] == 'reference':
found += 1
column_idxs.append(found)
column_list.append(field)
if field["field_name"] == "category":
found += 1
column_idxs.append(found)
column_list.append(field)
if stripped in field["column_title"]:
found1 = True
column_idxs.append(cell_index)
column_list.append(field)
break
if not found1:
# field not found in mapping - log
uf = importer.models.UnmappedFields()
uf.file_name = self.file
uf.tab_name = sh
uf.field = str(cell_value)
uf.save()
done = False
for row in wb[sh].iter_rows(min_row=title_row+1):
# For each row in the worksheet find the values by using the index list
# If value is valid add the value and field to the objects in the model
for row in self.get_rows(wb, sh, tr+1):
skip = False
# clear details object as each row is a new details record
if "details" in self.reference_objects:
Expand All @@ -264,8 +291,16 @@ def parse_spreadsheet(self, wb):
elif field["field_name"] == "category":
value = sh
else:
cell1 = row[column_idxs[i]-1]
value = str(cell1.value)
ind = column_idxs[i]
cell1 = row[ind-1]
value = str(self.get_value(cell1))
# special treatment - stock_name must be populated or row is not a data row
if "stock_name" == field["field_name"] and (value is None or value == 'None' or value == ''):
skip = True
break
#if cell1 is None or cell1 == 'None' or cell1 == '':
# break
# if value is a formula - currently only relevant in pyxl
if value.startswith("="):
try:
# field is a formula field - calculate it
Expand All @@ -280,6 +315,7 @@ def parse_spreadsheet(self, wb):
"\n\r+++"+str(e)
fni.save()
break
# apply format - is it really needed?
f_format = cell1.number_format
if f_format == "0.00%":
# match f_format:
Expand All @@ -291,10 +327,6 @@ def parse_spreadsheet(self, wb):
# round to 2 decimal points
value = f"{value:.2f}"
# print(self.file+","+sh+ "-"+cell1.coordinate+"-"+str(value) +" format ="+f_format )
# special treatment - stock_name must be populated or row is not a data row
if "stock_name" == field["field_name"] and (value is None or value == 'None' or value == ''):
skip = True
break
# special treatment - row starting with * signals end of data
if '*' in str(value):
done = True
Expand Down

0 comments on commit 0c7622d

Please sign in to comment.