Django ORM Performance Analysis with Geordi

First, let me say: I am a python developer with very little django experience.

While contributing to a django project recently, one of my AJAX endpoints began taking an unacceptable amount of time to return data. These are my notes on analyzing and addressing the problem.

After the fact, one django specialist I spoke to about this reassured me there were ways of alleviating the problem without dismissing the entire ORM for this endpoint. I would be interested in measuring those, I still find my results interesting.

Please ignore the domain-specific aspects of the code, it is enough to know that the endpoint is used to populate a time series plot generated by c3.js, which gets decidedly unsexy as the number of seconds for each update increases, which appears to be due to the way related objects are looked up.

A note about performance

Once reaching the point of having a few dozen client utilities in place, AJAX requests were taking 4+ seconds each

@login_required
def forecastdemand_data(request):
    """returns data to populate timeseries plot @ /gas/forecastdemand/
    Data returned looks something like:
    {
        "x-axis": ["2016-01-01", "2016-02-01", "2016-03-01", ...],
        "Alpha - ConEd": [220, 240, 270, 250, 280],
        "Alpha - PSEG": [180, 150, 300, 70, 120],
        "Astral - ConEd": [200, 310, 150, 100, 180]
    }
    Unless there is a truthy value for a 'csv' GET parameter, in which case:
    'Client - Utility','YYYY-MM-DD','YYYY-MM-DD',...
    'Alpha - ConEd',220,240,270,...
    """
    as_csv = bool(request.GET.get('csv', ''))
    client_id = int(request.GET.get('client_id', 0))
    utility_id = int(request.GET.get('utility_id', 0))
    daily = int(request.GET.get('daily', 1))

    start = datetime.datetime.strptime(request.GET['from_date'], '%Y-%m-%d').date()
    (wday1st, days_in_month) = calendar.monthrange(start.year, start.month)

    if daily:
        end = start.replace(day=days_in_month)
    else:
        end = datetime.datetime.strptime(request.GET['to_date'], '%Y-%m-%d').date()

    if request.user.is_staff:
        # show staff all active clients
        clients = Client.objects.filter(active=True, gas=True)
        utilities = Utility.objects.filter(active=True, gas=True)
    else:
        # filter client list to logged-in user's associated clients
        clients = ClientManager().clients_for_user(request.user)
        clients = [ c for c in clients if c.gas ]
        cus = ClientUtility.objects.filter(client__in=clients)
        utilities = [ cu.utility for cu in cus ]

    if client_id:
        clients = [ c for c in clients if c.clientid == client_id ]

    if utility_id:
        utilities = [ u for u in utilities if u.utilityid == utility_id ]

    vectren = Utility.objects.get(utilityabbrev__iexact='VCTN')

    kwargs = {
        'client__in': clients,
        'utility__in': utilities,
    }

    if daily:
        kwargs['day__gte'] = start
        kwargs['day__lte'] = end
    else:
        kwargs['month__gte'] = start
        kwargs['month__lte'] = end

    if utility_id and client_id:
        # ConEd & NJNG don't offer utility_group granularity, but we still
        # want the detail page to work for them, so just return utility_group=''
        coned = Utility.objects.get(utilityabbrev__iexact='ConEd')
        njng = Utility.objects.get(utilityabbrev__iexact='NJNG')
        # Vectren's utility_group amounts do not sum to total daily demand
        if utility_id in (coned.utilityid, njng.utilityid, vectren.utilityid):
            pass
        else:
            # only select rows with utility group
            kwargs['utility_group__gt'] = ''
    else:
        kwargs['utility_group'] = ''

    if daily:
        demands = DailyForecastedDemand.objects.filter(**kwargs).order_by('day')
    else:
        demands = MonthlyForecastedDemand.objects.filter(**kwargs).order_by('month')

    client_utilities = set()
    demand_data = collections.defaultdict(dict)
    for fd in demands:
        clientabbrev = fd.client.clientabbrev
        utilityabbrev = fd.utility.utilityabbrev
        utilitygrp = fd.utility_group

        if daily:
            key = fd.day.strftime('%Y-%m-%d')
        else:
            key = fd.month.strftime('%Y-%m-%d')

        cu = '{} - {}'.format(clientabbrev, utilityabbrev)
        if utilitygrp:
            if utilitygrp == 'Transco-Z6 (NY)':
                utilitygrp = 'Transco'
            cu = '{} - {}'.format(cu, utilitygrp)

        dths = round(fd.decatherms)

        if daily and utility_id == vectren.utilityid:
            if utilitygrp:
                dths = (dths, fd.max_dths)

        demand_data[key][cu] = dths

        client_utilities.add(cu)
    # keys is a list of YYYY-MM-DD date strings represented in the data set
    keys = sorted(demand_data.keys())
    client_utilities = sorted(list(client_utilities))

    # client_utilities is list of strings: ['ALPHA - ConEd', 'ALPHA - PSEG']
    # months is list of strings: ['2015-06-01', '2015-07-01']
    # demand_data is dict {'2015-10-01': {'AST - ConEd': 92.074}}
    # not all clients are guaranteed to have data for all months

    if as_csv:
        fn = 'forecasted_demands_{}_{}.csv'.format(start, end)
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}"'.format(fn)
        writer = csv.writer(response)
        if daily:
            pretty_keys = [ datetime.datetime.strptime(k, '%Y-%m-%d').strftime('%d %b %Y') for k in keys ]
        else:
            pretty_keys = [ datetime.datetime.strptime(k, '%Y-%m-%d').strftime('%b %Y') for k in keys ]
        writer.writerow(['Client - Utility'] + pretty_keys)
        for cu in client_utilities:
            row = [cu]
            for key in keys:
                row.append(demand_data[key].get(cu, ''))
            writer.writerow(row)
        return response

    data = {
        "x-axis": keys,
    }
    for cu in client_utilities:
        data[cu] = []
        for key in keys:
            data[cu].append(demand_data[key].get(cu, ''))
    return HttpResponse(json.dumps(data), content_type="application/json")

I used a project called Geordi to profile the execution time of each function on my local machine, and found the following:

The django ORM version shows 3.86 seconds total execution time, of which 99.31% is spent in the view function itself, and 97.36% in the ORM’s related.__get__() method. Note the django framework overhead itself is responsible for only 0.51% of the total execution time.

So, rather than rely so heavily on django’s ORM for the /data endpoint, I replaced most of those queries with a couple of hand written queries, and the response time went down to between 20ms-40ms

@login_required
def forecastdemand_data(request):
    """returns data to populate timeseries plot @ /gas/forecastdemand/
    Data returned looks something like:
    {
        "x-axis": ["2016-01-01", "2016-02-01", "2016-03-01", ...],
        "Alpha - ConEd": [220, 240, 270, 250, 280],
        "Alpha - PSEG": [180, 150, 300, 70, 120],
        "Astral - ConEd": [200, 310, 150, 100, 180]
    }
    Unless there is a truthy value for a 'csv' GET parameter, in which case:
    'Client - Utility','YYYY-MM-DD','YYYY-MM-DD',...
    'Alpha - ConEd',220,240,270,...
    """
    as_csv = bool(request.GET.get('csv', ''))
    client_id = int(request.GET.get('client_id', 0))
    utility_id = int(request.GET.get('utility_id', 0))
    daily = int(request.GET.get('daily', 1))

    start = datetime.datetime.strptime(request.GET['from_date'], '%Y-%m-%d').date()
    (wday1st, days_in_month) = calendar.monthrange(start.year, start.month)

    if daily:
        end = start.replace(day=days_in_month)
    else:
        end = datetime.datetime.strptime(request.GET['to_date'], '%Y-%m-%d').date()

    cus = get_clientutilities()
    clients = [ get_client_by_id(cu['client_id']) for cu in cus.values() ]
    utilities = dict([ (cu['utility_id'], get_utility_by_id(cu['utility_id'])) for cu in cus.values() ]).values()

    if not request.user.is_staff:
        # filter client & utility lists to logged-in user's associated clients
        user_clients = ClientManager().clients_for_user(request.user)
        user_clients = [ uc for uc in user_clients if uc.gas ]
        user_client_ids = [ uc.client_id for uc in user_clients ]
        clients = [ c for c in clients if c['client_id'] in user_client_ids ]

        user_utilities = list(itertools.chain(*[ c['utilities'] for c in user_clients ] ))
        user_utility_ids = list(set([ u['utility_id'] for u in user_utilities ]))
        utilities = [ u for u in utilities if u['utility_id'] in user_utility_ids ]

    client_filter = list(set([ c['client_id'] for c in clients ]))
    if client_id and client_id in client_filter:
        client_filter = client_id

    utility_filter = list(set([ u['utility_id'] for u in utilities ]))
    if utility_id and utility_id in utility_filter:
        utility_filter = utility_id

    kwargs = {
        'client_id': client_filter,
        'utility_id': utility_filter,
        'date_start': start,
        'date_end': end,
        'daily': daily,
        'monthly': not daily,
    }

    # if both utility_id & client_id are passed, we want to query detail
    # records (LENGTH(utility_group) > 0), Except:
    #  * ConEd & NJNG don't offer utility_group granularity, but we still
    #    want the detail page to work for them, so just return utility_group=''
    ug_except = []
    #  * Vectren's utility_group amounts do not sum to total daily demand,
    #    so display of the sum row is special cased in JS on client side
    min_max_utilities = []
    for u in utilities:
        if u['utilityabbrev'].upper() == 'VCTN':
            min_max_utilities.append(u['utility_id'])
        if u['utilityabbrev'].upper() in ('CONED', 'NJNG', 'VCTN'):
            ug_except.append(u['utility_id'])

    if utility_id and client_id and not utility_id in ug_except:
        # only select rows with utility group
        kwargs['utility_groups'] = True
    else:
        kwargs['utility_groups'] = False

    demands = get_forecasteddemands(**kwargs)

    client_utilities = set()
    demand_data = collections.defaultdict(dict)
    for (cuids, cudata) in demands.iteritems():
        client_id = cuids[0]
        client = get_client_by_id(client_id)
        clientabbrev = client['clientabbrev']

        utility_id = cuids[1]
        utility = get_utility_by_id(utility_id)
        utilityabbrev = utility['utilityabbrev']

        # ['TETCO', '', 'TRANSCO']
        for utilitygrp in cudata.keys():

            cu = '{} - {}'.format(clientabbrev, utilityabbrev)
            if utilitygrp and utility_id and client_id:
                if utilitygrp == 'Transco-Z6 (NY)':
                    utilitygrp = 'Transco'
                cu = '{} - {}'.format(cu, utilitygrp)
            client_utilities.add(cu)

            # [('daily', '2016-10-01')]
            for (scope, datestr) in cudata[utilitygrp].keys():
                dths = cudata[utilitygrp][(scope, datestr)]

                # Vectren daily detail chart uses (min, max) split-cell display
                if daily and utilitygrp and utility_id in min_max_utilities:
                    # except for csv, which uses two rows
                    if as_csv:
                        demand_data[datestr][cu + ' MIN'] = round(dths[0])
                        demand_data[datestr][cu + ' MAX'] = round(dths[1])
                    else:
                        dths = (round(dths[0]), round(dths[1]))
                else:
                    demand_data[datestr][cu] = round(dths)

    # keys is a list of YYYY-MM-DD date strings represented in the data set
    keys = sorted(demand_data.keys())
    client_utilities = sorted(list(client_utilities))

    # client_utilities is list of strings: ['ALPHA - ConEd', 'ALPHA - PSEG']
    # keys is list of strings: ['2015-06-01', '2015-07-01']
    # demand_data is dict {'2015-10-01': {'AST - ConEd': 92.074}}
    # not all clients are guaranteed to have data for all months/keys

    if as_csv:
        fn = 'forecasted_demands_{}_{}.csv'.format(start, end)
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}"'.format(fn)
        writer = csv.writer(response)
        if daily:
            pretty_keys = [ datetime.datetime.strptime(k, '%Y-%m-%d').strftime('%d %b %Y') for k in keys ]
        else:
            pretty_keys = [ datetime.datetime.strptime(k, '%Y-%m-%d').strftime('%b %Y') for k in keys ]
        writer.writerow(['Client - Utility'] + pretty_keys)
        for cu in client_utilities:
            row = [cu]
            for key in keys:
                row.append(demand_data[key].get(cu, ''))
            writer.writerow(row)
        return response

    data = {
        "x-axis": keys,
    }
    for cu in client_utilities:
        data[cu] = []
        for key in keys:
            data[cu].append(demand_data[key].get(cu, ''))
    return HttpResponse(json.dumps(data), content_type="application/json")

The minimal ORM version (which does still use the ORM for a few queries, just not for the fetching of forecasted demands, and all of the redundant lookups of related objects) clocks in @ 0.034 seconds, of which the view function is responsible for only 23.15% (0.008013 clock time vs. 3.834297 clock time in the previous version). Where the previous version’s decorators:45:<lambda> method was responsible for only 0.51% of the total (0.01954 clock time), in this version it is 57.37% (with a virtually identical clock time of 0.01986).