source: ReferenceDesigns/w3_802.11/python/examples/log/log_anonymize.py

Last change on this file was 6320, checked in by chunter, 5 years ago

1.8.0 release wlan-exp

File size: 11.5 KB
Line 
1"""
2------------------------------------------------------------------------------
3Mango 802.11 Reference Design Experiments Framework - Log File Anonymizer
4------------------------------------------------------------------------------
5License:   Copyright 2014-2019, Mango Communications. All rights reserved.
6           Distributed under the WARP license (http://warpproject.org/license)
7------------------------------------------------------------------------------
This script uses the wlan_exp Log framework to anonymize a given hdf5
log file that contains data associated with an experiment utilizing the
802.11 reference design and 802.11 Reference Design Experiments Framework.
11
12Hardware Setup:
13    - None.  Anonymizing log data can be done completely off-line
14
15Required Script Changes:
16    - None.  Script requires filename of file to be anonymized to be
17      passed in on the command line.
18
19Description:
    This script parses the log file, removes any personally identifiable
    information from the log and writes the resulting log data to a new
    file.  The personally identifiable information that is removed:
        - Any MAC address that is not in the following categories:
            - Broadcast Address (ff-ff-ff-ff-ff-ff)
            - IP v4 Multicast Address (01-00-5E-xx-xx-xx)
            - IP v6 Multicast Address (33-33-xx-xx-xx-xx)
            - WARP node (40-D8-55-04-2x-xx)
28        - Any payloads from transmissions / receptions
29        - Any commands
30        - Hostnames in the station info
31------------------------------------------------------------------------------
32"""
33import sys
34import os
35import time
36import struct
37
38import wlan_exp.log.util as log_util
39import wlan_exp.log.util_hdf as hdf_util
40import wlan_exp.log.entry_types as entry_types
41
42#-----------------------------------------------------------------------------
43# Global Variables
44#-----------------------------------------------------------------------------
45
# Set to True to print the elapsed time after each anonymization stage
print_time   = False

# Running list of every MAC address selected for replacement.  It is shared
# by all files processed in one run; an address's position in this list
# determines the anonymous address it is mapped to.
all_addrs    = []

# Module-level address -> byte-index map.  NOTE: in the code visible in this
# file, log_anonymize() builds its own local map of the same name and
# addr_to_replace() receives the map as an argument.
addr_idx_map = {}
51
52
53
54#-----------------------------------------------------------------------------
55# Anonymizer Methods
56#-----------------------------------------------------------------------------
def do_replace_addr(addr):
    """Determine if the MAC address should be replaced.

    Args:
        addr (tuple of int): MAC address as a tuple of byte values
            (normally six of them).

    Returns:
        bool: False when the address belongs to one of the preserved
        categories (broadcast, IP v4 multicast, IP v6 multicast, Mango
        hardware); True when it must be anonymized.
    """
    # This list should stay in sync with wlan_exp.util mac_addr_desc_map

    # Don't replace the broadcast address (FF-FF-FF-FF-FF-FF)
    if addr == (0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF):
        return False

    # Don't replace multicast IP v4 addresses (01-00-5E-00-00-00 to -7F-FF-FF)
    #   http://technet.microsoft.com/en-us/library/cc957928.aspx
    if addr[0:3] == (0x01, 0x00, 0x5E) and addr[3] <= 0x7F:
        return False

    # Don't replace multicast IP v6 addresses (33-33-xx-xx-xx-xx)
    #   http://www.cavebear.com/archive/cavebear/Ethernet/multicast.html
    if addr[0:2] == (0x33, 0x33):
        return False

    # Don't replace Mango addresses (40-D8-55-04-2x-xx)
    #   The whole high nibble of the fifth octet must equal 0x2.  Testing
    #   only bit 0x20 would also exempt 0x3x, 0x6x, 0x7x, 0xAx, ... which are
    #   outside the documented range and would therefore escape anonymization.
    if addr[0:4] == (0x40, 0xD8, 0x55, 0x04) and (addr[4] & 0xF0) == 0x20:
        return False

    return True
82
83
def addr_to_replace(addr, byte_index, addr_idx_map):
    """Record one occurrence of a MAC address that needs to be anonymized.

    Addresses that do_replace_addr() rejects are ignored.  Otherwise the
    address is added to the module-level ``all_addrs`` list (once, in
    first-seen order) and ``byte_index`` is appended to that address's list
    in ``addr_idx_map`` so every occurrence can later be overwritten with
    the same replacement value.

    Args:
        addr:         MAC address as a hashable tuple of byte values
        byte_index:   Position in the log data where this address starts
        addr_idx_map: Dictionary mapping address -> list of byte indexes;
                      updated in place
    """
    if not do_replace_addr(addr):
        return

    # Remember each replaceable address exactly once
    if addr not in all_addrs:
        all_addrs.append(addr)

    # Create the index list on first sight, then record this occurrence
    addr_idx_map.setdefault(addr, []).append(byte_index)
97
98
def _collect_mac_addrs(log_bytes, entry_offsets, pyld_start, addr_idx_map):
    """Register the three MAC header addresses of every listed log entry.

    Args:
        log_bytes:     bytearray holding the raw log data
        entry_offsets: Byte offsets into log_bytes, taken from the log index
        pyld_start:    Offset from an entry offset to the start of that
                       entry's mac_payload field
        addr_idx_map:  Dictionary mapping address -> list of byte indexes;
                       updated in place through addr_to_replace()
    """
    for idx in entry_offsets:
        # 6-byte addresses at offsets 4, 10, 16 in the mac_payload
        for o in (4, 10, 16):
            addr_start = idx + pyld_start + o

            # Skip an address that would run past the end of the data: a
            # short slice is not a valid 6-byte address, and writing six
            # replacement bytes over it later would grow the buffer.
            if addr_start + 6 > len(log_bytes):
                continue

            addr_to_replace(tuple(log_bytes[addr_start:addr_start + 6]),
                            addr_start, addr_idx_map)


def log_anonymize(filename):
    """Anonymize the log.

    Reads the log data, log index and user attributes from the HDF5 file
    ``filename``, replaces every MAC address selected by do_replace_addr()
    with an anonymous address, zeroes the payloads, and writes the result to
    a new file named ``<base>_anon<ext>`` in the same folder.

    The module-level ``all_addrs`` list is extended with any newly seen
    addresses, so the same real address maps to the same anonymous address
    in every file processed during one run.

    Args:
        filename (str): Path of the HDF5 log file to anonymize.
    """
    global all_addrs

    # Get the log_data from the file
    log_bytes = bytearray(hdf_util.hdf5_to_log_data(filename=filename))

    # Get the raw_log_index from the file
    raw_log_index = hdf_util.hdf5_to_log_index(filename=filename)

    # Get the user attributes from the file
    log_attr_dict = hdf_util.hdf5_to_attr_dict(filename=filename)

    # Generate the index of log entry locations sorted by log entry type
    #    Merge the Rx / Tx subtypes that can be processed together
    log_index = log_util.filter_log_index(raw_log_index,
                                          merge={'RX_OFDM': ['RX_OFDM', 'RX_OFDM_LTG'],
                                                 'TX_HIGH': ['TX_HIGH', 'TX_HIGH_LTG'],
                                                 'TX_LOW' : ['TX_LOW', 'TX_LOW_LTG']})

    # Re-initialize the address-byteindex map per file using the running
    #   list of known MAC addresses
    addr_idx_map = {addr: [] for addr in all_addrs}

    log_util.print_log_index_summary(log_index, "Log Index Summary (merged):")

    #---------------------------------------------------------------------
    # Step 1: Build a dictionary of all MAC addresses in the log, then
    #   map each address to a unique anonymous address
    #   Uses tuple(bytearray slice) since bytearray isn't hashable as-is
    #
    print("Anonmyizing file step 1 ...")

    start_time = time.time()

    # (log index key, name of the matching entry type in entry_types).
    # The entry type is looked up by name only after the key is known to be
    # present, so absent entry kinds never touch entry_types.
    addr_sources = (('RX_DSSS', 'entry_rx_dsss'),
                    ('RX_OFDM', 'entry_rx_ofdm'),
                    ('TX_HIGH', 'entry_tx_high'),
                    ('TX_LOW',  'entry_tx_low'))

    for (index_key, entry_type_name) in addr_sources:
        # Only a missing index key means "no entries of this kind"; any
        # other error while processing the entries is allowed to surface.
        try:
            entry_offsets = log_index[index_key]
        except KeyError:
            entry_offsets = None

        if entry_offsets is not None:
            print("    Anonmyizing {0} {1} entries".format(len(entry_offsets), index_key))

            # Size of every field except the last one, i.e. the offset of
            # the trailing field (the mac_payload) within the entry
            entry_type = getattr(entry_types, entry_type_name)
            pyld_start = struct.calcsize(''.join(
                    entry_type.get_field_struct_formats()[:-1])
            )

            _collect_mac_addrs(log_bytes, entry_offsets, pyld_start, addr_idx_map)

        if print_time:
            print("        Time = {0:.3f}s".format(time.time() - start_time))

    #---------------------------------------------------------------------
    # Step 2: Enumerate actual MAC addresses and their anonymous replacements
    #
    print("Anonmyizing file step 2 ...")

    print("    Enumerate MAC addresses and their anonymous replacements")

    addr_map = dict()
    for ii, addr in enumerate(all_addrs):
        # Address should not have a first octet that is odd, as this indicates
        # the address is multicast.  Hence, use 0xFE as the first octet.
        #
        # Due to FCS errors, the number of addresses in a log file is
        # potentially large.  Therefore, the anonymizer supports 2^24 unique
        # addresses.
        #
        anon_addr = (0xFE, 0xFF, 0xFF, (ii//(256**2)), ((ii//256)%256), (ii%256))
        addr_map[addr] = anon_addr

    if print_time:
        print("        Time = {0:.3f}s".format(time.time() - start_time))

    #---------------------------------------------------------------------
    # Step 3: Replace all MAC addresses in the log
    #
    print("Anonmyizing file step 3 ...")

    print("    Replace all MAC addresses in the log")

    for old_addr, byte_idxs in addr_idx_map.items():
        new_addr = bytearray(addr_map[old_addr])
        for byte_idx in byte_idxs:
            log_bytes[byte_idx:byte_idx+6] = new_addr

    if print_time:
        print("        Time = {0:.3f}s".format(time.time() - start_time))

    #---------------------------------------------------------------------
    # Step 4: Other anonymization steps
    #
    print("Anonmyizing file step 4 ...")

    print("    Remove all payloads")

    # Overwrite all payloads with zeros.  Each entry type is handled on its
    # own so one failure does not skip the remaining types, and the failure
    # is reported instead of being silently ignored: an unreported failure
    # here would leave payload data in a file presented as anonymized.
    for key in log_index.keys():
        try:
            log_util.overwrite_payloads(log_bytes, log_index[key])
        except Exception as err:
            print("    WARNING: Could not remove payloads for '{0}' entries: {1}".format(key, err))

    if print_time:
        print("        Time = {0:.3f}s".format(time.time() - start_time))

    #---------------------------------------------------------------------
    # Write output files
    #

    # Write the modified log to a new HDF5 file
    (fn_fldr, fn_file) = os.path.split(filename)

    # Find the last '.' in the file name and classify everything after that as the <ext>
    ext_i = fn_file.rfind('.')
    if (ext_i != -1):
        # Remember the original file extension
        fn_ext  = fn_file[ext_i:]
        fn_base = fn_file[0:ext_i]
    else:
        fn_ext  = ''
        fn_base = fn_file

    newfilename = os.path.join(fn_fldr, fn_base + "_anon" + fn_ext)

    print("Writing new file {0} ...".format(newfilename))

    # Copy any user attributes to the new anonymized file
    hdf_util.log_data_to_hdf5(log_bytes, newfilename, attr_dict=log_attr_dict)

    return
301
302
303#-----------------------------------------------------------------------------
304# Main
305#-----------------------------------------------------------------------------
306
if __name__ == '__main__':
    # Usage: one or more log file names on the command line
    if(len(sys.argv) < 2):
        print("ERROR: must provide at least one log file input")
        # Exit with a non-zero status so a calling shell or script can
        # detect that nothing was anonymized
        sys.exit(1)

    for filename in sys.argv[1:]:
        # Ensure the log file actually exists; Print an error and continue to the next file.
        if(not os.path.isfile(filename)):
            print("\nERROR: File {0} not found".format(filename))
        else:
            print("\nAnonymizing file '{0}' ({1:5.1f} MB)\n".format(filename, (os.path.getsize(filename)/1E6)))
            log_anonymize(filename)

    # Report the real -> anonymous mapping accumulated over all files.  The
    # anonymous address is derived from the address's position in all_addrs
    # using the same formula as log_anonymize().
    print("\nMAC Address Mapping:")
    for ii,addr in enumerate(all_addrs):
        anon_addr = (0xFE, 0xFF, 0xFF, (ii//(256**2)), ((ii//256)%256), (ii%256))
        print("%2d: %02x:%02x:%02x:%02x:%02x:%02x -> %02x:%02x:%02x:%02x:%02x:%02x" %
            (ii, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5],
             anon_addr[0], anon_addr[1], anon_addr[2], anon_addr[3], anon_addr[4], anon_addr[5]))
Note: See TracBrowser for help on using the repository browser.