First, let me say: I am a python developer with very little django experience.
While contributing to a django project recently, one of my AJAX endpoints
began taking an unacceptable amount of time to return data. These are my
notes on analyzing and addressing the problem.
After the fact, one django specialist I spoke to about this reassured me there
were ways of alleviating the problem without dismissing the entire ORM for
this endpoint. I would be interested in measuring those, I still find my
results interesting.
Please ignore the domain-specific aspects of the code, it is enough to know
that the endpoint is used to populate a time series plot generated by c3.js,
which gets decidedly unsexy as the number of seconds for each update
increases, which appears to be due to the way related objects are looked up.
Once reaching the point of having a few dozen client utilities in place, AJAX
requests were taking 4+ seconds each
@ login_required
def forecastdemand_data ( request ):
"""returns data to populate timeseries plot @ /gas/forecastdemand/
Data returned looks something like:
{
"x-axis": ["2016-01-01", "2016-02-01", "2016-03-01", ...],
"Alpha - ConEd": [220, 240, 270, 250, 280],
"Alpha - PSEG": [180, 150, 300, 70, 120],
"Astral - ConEd": [200, 310, 150, 100, 180]
}
Unless there is a truthy value for a 'csv' GET parameter, in which case:
'Client - Utility','YYYY-MM-DD','YYYY-MM-DD',...
'Alpha - ConEd',220,240,270,...
"""
as_csv = bool ( request . GET . get ( 'csv' , '' ))
client_id = int ( request . GET . get ( 'client_id' , 0 ))
utility_id = int ( request . GET . get ( 'utility_id' , 0 ))
daily = int ( request . GET . get ( 'daily' , 1 ))
start = datetime . datetime . strptime ( request . GET [ 'from_date' ], ' % Y- % m- % d' ) . date ()
( wday1st , days_in_month ) = calendar . monthrange ( start . year , start . month )
if daily :
end = start . replace ( day = days_in_month )
else :
end = datetime . datetime . strptime ( request . GET [ 'to_date' ], ' % Y- % m- % d' ) . date ()
if request . user . is_staff :
# show staff all active clients
clients = Client . objects . filter ( active = True , gas = True )
utilities = Utility . objects . filter ( active = True , gas = True )
else :
# filter client list to logged-in user's associated clients
clients = ClientManager () . clients_for_user ( request . user )
clients = [ c for c in clients if c . gas ]
cus = ClientUtility . objects . filter ( client__in = clients )
utilities = [ cu . utility for cu in cus ]
if client_id :
clients = [ c for c in clients if c . clientid == client_id ]
if utility_id :
utilities = [ u for u in utilities if u . utilityid == utility_id ]
vectren = Utility . objects . get ( utilityabbrev__iexact = 'VCTN' )
kwargs = {
'client__in' : clients ,
'utility__in' : utilities ,
}
if daily :
kwargs [ 'day__gte' ] = start
kwargs [ 'day__lte' ] = end
else :
kwargs [ 'month__gte' ] = start
kwargs [ 'month__lte' ] = end
if utility_id and client_id :
# ConEd & NJNG don't offer utility_group granularity, but we still
# want the detail page to work for them, so just return utility_group=''
coned = Utility . objects . get ( utilityabbrev__iexact = 'ConEd' )
njng = Utility . objects . get ( utilityabbrev__iexact = 'NJNG' )
# Vectren's utility_group amounts do not sum to total daily demand
if utility_id in ( coned . utilityid , njng . utilityid , vectren . utilityid ):
pass
else :
# only select rows with utility group
kwargs [ 'utility_group__gt' ] = ''
else :
kwargs [ 'utility_group' ] = ''
if daily :
demands = DailyForecastedDemand . objects . filter ( ** kwargs ) . order_by ( 'day' )
else :
demands = MonthlyForecastedDemand . objects . filter ( ** kwargs ) . order_by ( 'month' )
client_utilities = set ()
demand_data = collections . defaultdict ( dict )
for fd in demands :
clientabbrev = fd . client . clientabbrev
utilityabbrev = fd . utility . utilityabbrev
utilitygrp = fd . utility_group
if daily :
key = fd . day . strftime ( ' % Y- % m- % d' )
else :
key = fd . month . strftime ( ' % Y- % m- % d' )
cu = '{} - {}' . format ( clientabbrev , utilityabbrev )
if utilitygrp :
if utilitygrp == 'Transco-Z6 (NY)' :
utilitygrp = 'Transco'
cu = '{} - {}' . format ( cu , utilitygrp )
dths = round ( fd . decatherms )
if daily and utility_id == vectren . utilityid :
if utilitygrp :
dths = ( dths , fd . max_dths )
demand_data [ key ][ cu ] = dths
client_utilities . add ( cu )
# keys is a list of YYYY-MM-DD date strings represented in the data set
keys = sorted ( demand_data . keys ())
client_utilities = sorted ( list ( client_utilities ))
# client_utilities is list of strings: ['ALPHA - ConEd', 'ALPHA - PSEG']
# months is list of strings: ['2015-06-01', '2015-07-01']
# demand_data is dict {'2015-10-01': {'AST - ConEd': 92.074}}
# not all clients are guaranteed to have data for all months
if as_csv :
fn = 'forecasted_demands_{}_{}.csv' . format ( start , end )
response = HttpResponse ( content_type = 'text/csv' )
response [ 'Content-Disposition' ] = 'attachment; filename="{}"' . format ( fn )
writer = csv . writer ( response )
if daily :
pretty_keys = [ datetime . datetime . strptime ( k , ' % Y- % m- % d' ) . strftime ( ' % d % b % Y' ) for k in keys ]
else :
pretty_keys = [ datetime . datetime . strptime ( k , ' % Y- % m- % d' ) . strftime ( ' % b % Y' ) for k in keys ]
writer . writerow ([ 'Client - Utility' ] + pretty_keys )
for cu in client_utilities :
row = [ cu ]
for key in keys :
row . append ( demand_data [ key ] . get ( cu , '' ))
writer . writerow ( row )
return response
data = {
"x-axis" : keys ,
}
for cu in client_utilities :
data [ cu ] = []
for key in keys :
data [ cu ] . append ( demand_data [ key ] . get ( cu , '' ))
return HttpResponse ( json . dumps ( data ), content_type = "application/json" )
I used a project called Geordi to
profile the execution time of each function on my local machine, and found the
following:
The django ORM version
shows 3.86 seconds total execution time, of which 99.31% is spent in the view
function itself, and 97.36% in the ORM’s related.__get__()
method. Note the
django framework overhead itself is responsible for only 0.51% of the total
execution time.
So, rather than rely so heavily on django’s ORM for the /data endpoint, I replaced
most of those queries with a couple of hand written queries, and the response time went
down to between 20ms-40ms
@ login_required
def forecastdemand_data ( request ):
"""returns data to populate timeseries plot @ /gas/forecastdemand/
Data returned looks something like:
{
"x-axis": ["2016-01-01", "2016-02-01", "2016-03-01", ...],
"Alpha - ConEd": [220, 240, 270, 250, 280],
"Alpha - PSEG": [180, 150, 300, 70, 120],
"Astral - ConEd": [200, 310, 150, 100, 180]
}
Unless there is a truthy value for a 'csv' GET parameter, in which case:
'Client - Utility','YYYY-MM-DD','YYYY-MM-DD',...
'Alpha - ConEd',220,240,270,...
"""
as_csv = bool ( request . GET . get ( 'csv' , '' ))
client_id = int ( request . GET . get ( 'client_id' , 0 ))
utility_id = int ( request . GET . get ( 'utility_id' , 0 ))
daily = int ( request . GET . get ( 'daily' , 1 ))
start = datetime . datetime . strptime ( request . GET [ 'from_date' ], ' % Y- % m- % d' ) . date ()
( wday1st , days_in_month ) = calendar . monthrange ( start . year , start . month )
if daily :
end = start . replace ( day = days_in_month )
else :
end = datetime . datetime . strptime ( request . GET [ 'to_date' ], ' % Y- % m- % d' ) . date ()
cus = get_clientutilities ()
clients = [ get_client_by_id ( cu [ 'client_id' ]) for cu in cus . values () ]
utilities = dict ([ ( cu [ 'utility_id' ], get_utility_by_id ( cu [ 'utility_id' ])) for cu in cus . values () ]) . values ()
if not request . user . is_staff :
# filter client & utility lists to logged-in user's associated clients
user_clients = ClientManager () . clients_for_user ( request . user )
user_clients = [ uc for uc in user_clients if uc . gas ]
user_client_ids = [ uc . client_id for uc in user_clients ]
clients = [ c for c in clients if c [ 'client_id' ] in user_client_ids ]
user_utilities = list ( itertools . chain ( * [ c [ 'utilities' ] for c in user_clients ] ))
user_utility_ids = list ( set ([ u [ 'utility_id' ] for u in user_utilities ]))
utilities = [ u for u in utilities if u [ 'utility_id' ] in user_utility_ids ]
client_filter = list ( set ([ c [ 'client_id' ] for c in clients ]))
if client_id and client_id in client_filter :
client_filter = client_id
utility_filter = list ( set ([ u [ 'utility_id' ] for u in utilities ]))
if utility_id and utility_id in utility_filter :
utility_filter = utility_id
kwargs = {
'client_id' : client_filter ,
'utility_id' : utility_filter ,
'date_start' : start ,
'date_end' : end ,
'daily' : daily ,
'monthly' : not daily ,
}
# if both utility_id & client_id are passed, we want to query detail
# records (LENGTH(utility_group) > 0), Except:
# * ConEd & NJNG don't offer utility_group granularity, but we still
# want the detail page to work for them, so just return utility_group=''
ug_except = []
# * Vectren's utility_group amounts do not sum to total daily demand,
# so display of the sum row is special cased in JS on client side
min_max_utilities = []
for u in utilities :
if u [ 'utilityabbrev' ] . upper () == 'VCTN' :
min_max_utilities . append ( u [ 'utility_id' ])
if u [ 'utilityabbrev' ] . upper () in ( 'CONED' , 'NJNG' , 'VCTN' ):
ug_except . append ( u [ 'utility_id' ])
if utility_id and client_id and not utility_id in ug_except :
# only select rows with utility group
kwargs [ 'utility_groups' ] = True
else :
kwargs [ 'utility_groups' ] = False
demands = get_forecasteddemands ( ** kwargs )
client_utilities = set ()
demand_data = collections . defaultdict ( dict )
for ( cuids , cudata ) in demands . iteritems ():
client_id = cuids [ 0 ]
client = get_client_by_id ( client_id )
clientabbrev = client [ 'clientabbrev' ]
utility_id = cuids [ 1 ]
utility = get_utility_by_id ( utility_id )
utilityabbrev = utility [ 'utilityabbrev' ]
# ['TETCO', '', 'TRANSCO']
for utilitygrp in cudata . keys ():
cu = '{} - {}' . format ( clientabbrev , utilityabbrev )
if utilitygrp and utility_id and client_id :
if utilitygrp == 'Transco-Z6 (NY)' :
utilitygrp = 'Transco'
cu = '{} - {}' . format ( cu , utilitygrp )
client_utilities . add ( cu )
# [('daily', '2016-10-01')]
for ( scope , datestr ) in cudata [ utilitygrp ] . keys ():
dths = cudata [ utilitygrp ][( scope , datestr )]
# Vectren daily detail chart uses (min, max) split-cell display
if daily and utilitygrp and utility_id in min_max_utilities :
# except for csv, which uses two rows
if as_csv :
demand_data [ datestr ][ cu + ' MIN' ] = round ( dths [ 0 ])
demand_data [ datestr ][ cu + ' MAX' ] = round ( dths [ 1 ])
else :
dths = ( round ( dths [ 0 ]), round ( dths [ 1 ]))
else :
demand_data [ datestr ][ cu ] = round ( dths )
# keys is a list of YYYY-MM-DD date strings represented in the data set
keys = sorted ( demand_data . keys ())
client_utilities = sorted ( list ( client_utilities ))
# client_utilities is list of strings: ['ALPHA - ConEd', 'ALPHA - PSEG']
# keys is list of strings: ['2015-06-01', '2015-07-01']
# demand_data is dict {'2015-10-01': {'AST - ConEd': 92.074}}
# not all clients are guaranteed to have data for all months/keys
if as_csv :
fn = 'forecasted_demands_{}_{}.csv' . format ( start , end )
response = HttpResponse ( content_type = 'text/csv' )
response [ 'Content-Disposition' ] = 'attachment; filename="{}"' . format ( fn )
writer = csv . writer ( response )
if daily :
pretty_keys = [ datetime . datetime . strptime ( k , ' % Y- % m- % d' ) . strftime ( ' % d % b % Y' ) for k in keys ]
else :
pretty_keys = [ datetime . datetime . strptime ( k , ' % Y- % m- % d' ) . strftime ( ' % b % Y' ) for k in keys ]
writer . writerow ([ 'Client - Utility' ] + pretty_keys )
for cu in client_utilities :
row = [ cu ]
for key in keys :
row . append ( demand_data [ key ] . get ( cu , '' ))
writer . writerow ( row )
return response
data = {
"x-axis" : keys ,
}
for cu in client_utilities :
data [ cu ] = []
for key in keys :
data [ cu ] . append ( demand_data [ key ] . get ( cu , '' ))
return HttpResponse ( json . dumps ( data ), content_type = "application/json" )
The minimal ORM version
(which does still use the ORM for a few queries, just not for the fetching of
forecasted demands, and all of the redundant lookups of related objects)
clocks in @ 0.034 seconds, of which the view function is responsible for only
23.15% (0.008013 clock time vs. 3.834297 clock time in the previous version).
Where the previous version’s decorators:45:<lambda>
method was responsible
for only 0.51% of the total (0.01954 clock time), in this version it is
57.37% (with a virtually identical clock time of 0.01986).