Skip to content

Commit

Permalink
Correctly handle requests with no response.
Browse files Browse the repository at this point in the history
Fix ancient variable misspelling, which revealed a bunch of other
bugs when response is None. Add test cases.
  • Loading branch information
Andrew Fleenor committed Aug 21, 2012
1 parent 4120b4f commit 938787e
Show file tree
Hide file tree
Showing 12 changed files with 424 additions and 39 deletions.
3 changes: 3 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
dest='pages', default=True)
parser.add_option('-d', '--drop-bodies', action='store_true',
dest='drop_bodies', default=False)
parser.add_option('-k', '--keep-unfulfilled-requests', action='store_true',
dest='keep_unfulfilled', default=False)
parser.add_option('-r', '--resource-usage', action='store_true',
dest='resource_usage', default=False)
parser.add_option('--pad_missing_tcp_data', action='store_true',
Expand All @@ -40,6 +42,7 @@
# copy options to settings module
settings.process_pages = options.pages
settings.drop_bodies = options.drop_bodies
settings.keep_unfulfilled_requests = options.keep_unfulfilled
settings.pad_missing_tcp_data = options.pad_missing_tcp_data
settings.strict_http_parse_body = options.strict_http_parsing

Expand Down
38 changes: 20 additions & 18 deletions pcap2har/http/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,19 @@ def __init__(self, tcpflow):
# first request is the benchmark; responses before that
# are irrelevant for now
self.pairs = []
# determine a list of responses that we can match up with requests,
# padding the list with None where necessary.
try:
# find the first response to a request we know about,
# that is, the first response after the first request
first_response_index = find_index(
lambda response: response.ts_start > requests[0].ts_start,
responses
)
except LookupError:
# no responses at all
pairable_responses = [None for i in requests]
else:
# these are responses that match up with our requests
pairable_responses = responses[first_response_index:]
# if there are more requests than responses...
Expand All @@ -54,24 +60,20 @@ def __init__(self, tcpflow):
pairable_responses.extend(
[None for i in range(len(requests) - len(pairable_responses))]
)
# if there are more responses, we would just ignore them anyway,
# which zip does for us
# create MessagePair's
connected = False # if conn. timing has been added to a request yet
for req, resp in zip(requests, responses):
if not req:
logging.warning('Request is missing.')
continue
if not connected and tcpflow.handshake:
req.ts_connect = tcpflow.handshake[0].ts
connected = True
else:
req.ts_connect = req.ts_start
self.pairs.append(MessagePair(req, resp))
except LookupError:
# there were no responses after the first request
# there's nothing we can do
logging.warning('Request has no response.')
# if there are more responses, we would just ignore them anyway,
# which zip does for us
# create MessagePair's
connected = False # if conn. timing has been added to a request yet
for req, resp in zip(requests, pairable_responses):
if not req:
logging.warning('Request is missing.')
continue
if not connected and tcpflow.handshake:
req.ts_connect = tcpflow.handshake[0].ts
connected = True
else:
req.ts_connect = req.ts_start
self.pairs.append(MessagePair(req, resp))


class MessagePair(object):
Expand Down
31 changes: 17 additions & 14 deletions pcap2har/httpsession.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,25 +37,27 @@ def __init__(self, request, response):
self.pageref = None
self.ts_start = int(request.ts_connect*1000)
self.startedDateTime = datetime.utcfromtimestamp(request.ts_connect)
endedDateTime = datetime.utcfromtimestamp(response.ts_end)
self.total_time = ms_from_timedelta(
endedDateTime - self.startedDateTime # plus connection time, someday
)
# calculate other timings
self.time_blocked = -1
self.time_dnsing = -1
self.time_connecting = (
ms_from_dpkt_time(request.ts_start - request.ts_connect))
self.time_sending = (
ms_from_dpkt_time(request.ts_end - request.ts_start))
self.time_waiting = (
ms_from_dpkt_time(response.ts_start - request.ts_end))
self.time_receiving = (
ms_from_dpkt_time(response.ts_end - response.ts_start))
# check if timing calculations are consistent
if (self.time_sending + self.time_waiting + self.time_receiving !=
self.total_time):
pass
if response is not None:
self.time_waiting = (
ms_from_dpkt_time(response.ts_start - request.ts_end))
self.time_receiving = (
ms_from_dpkt_time(response.ts_end - response.ts_start))
endedDateTime = datetime.utcfromtimestamp(response.ts_end)
self.total_time = ms_from_timedelta(
endedDateTime - self.startedDateTime
)
else:
# this can happen if the request never gets a response
self.time_waiting = -1
self.time_receiving = -1
self.total_time = -1

def json_repr(self):
'''
Expand Down Expand Up @@ -169,8 +171,9 @@ def __init__(self, packetdispatcher):
# if msg.request has a referer, keep track of that, too
if self.page_tracker:
entry.pageref = self.page_tracker.getref(entry)
# add it to the list
self.entries.append(entry)
# add it to the list, if we're supposed to keep it.
if entry.response or settings.keep_unfulfilled_requests:
self.entries.append(entry)
self.user_agent = self.user_agents.dominant_user_agent()
# handle DNS AFTER sorting
# this algo depends on first appearance of a name
Expand Down
12 changes: 7 additions & 5 deletions pcap2har/pagetracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,13 @@ def is_root_document(entry):
guesses whether the entry is from the root document of a web page
'''
# guess based on media type
mt = entry.response.mediaType
if mt.type == 'text':
if mt.subtype in ['html', 'xhtml', 'xml']:
# probably...
return True
if entry.response: # might be None
mt = entry.response.mediaType
if mt.type == 'text':
if mt.subtype in ['html', 'xhtml', 'xml']:
# probably...
return True
# else, guess by request url?
return False


Expand Down
4 changes: 4 additions & 0 deletions pcap2har/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@

# Whether to pad missing data in TCP flows with 0 bytes
pad_missing_tcp_data = False

# Whether to keep requests with missing responses. Could break consumers
# that assume every request has a response.
keep_unfulfilled_requests = False
6 changes: 6 additions & 0 deletions tests/README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,9 @@ pcapr.net.pcap
A pageload of pcapr.net, an online pcap repository. Includes a redirect
from pcapr.net to pcapr.net/home

missing_response.pcap:
A flow from fhs.pcap with one of the responses missing, to test the -k functionality.

request_only.pcap:
A flow with a request and nothing else, to test a different failure case of the -k option.

Binary file added tests/missing_response.pcap
Binary file not shown.
Loading

0 comments on commit 938787e

Please sign in to comment.