Based on the definition of "Reliable broadband" stated above, NTIA includes technology codes listed below in the analysis of a location's max service level.
Based on the FCC definition of "low latency" in the BDC data specification, NTIA classifies service availability with latency above 100 milliseconds as unserved.
For more info, see FCC's Data Spec. for BDC Public Data Downloads
DataBricks PySpark/Microsoft SQL Server:
-- For each provider reported record, determine the service level (sub-query)
-- Then select the highest service level for each location
SELECT
location_id,
-- Get the max service level from the sub-query to determine the max service value, map it to an approperiate string value
(CASE
WHEN MAX(service_level) = 0 THEN 'Unserved' -- if the max value for a location is 0, it is Unserved
WHEN MAX(service_level) = 1 THEN 'Underserved' -- if the max value for a location is 1, it is Underserved
WHEN MAX(service_level) = 2 THEN 'Served' -- if the max value for a location is 2, it is Served
END) AS loc_max_service_level
FROM (
SELECT
*,
(CASE
-- When a provider level records meets the critera for Unserved, return 0
-- Unserved: Speeds below 25/3 Mbps or NULL OR without low_latency (low_latency=0)
WHEN low_latency = 0 THEN 0
WHEN max_advertised_download_speed IS NULL THEN 0
WHEN max_advertised_upload_speed IS NULL THEN 0
WHEN max_advertised_download_speed < 25 THEN 0
WHEN max_advertised_upload_speed < 3 THEN 0
-- When a provider level records meets the critera for Underserved, return 1
-- Underserved: Speeds at or above 25/3 Mbps, but Below 100/20 Mbps with low_latency (low_latency=1)
WHEN max_advertised_download_speed BETWEEN 25 AND 99.999 THEN 1
WHEN max_advertised_upload_speed BETWEEN 3 AND 19.999 THEN 1
-- When a provider level records meets the critera for Served, return 2
-- Served: Service Level at or above 100/20 Mbps with low_latency (low_latency=1)
WHEN max_advertised_download_speed >= 100 THEN 2
WHEN max_advertised_upload_speed >= 20 THEN 2
END) AS service_level
FROM bdc
) t1
-- Group by the location id to get one record per location
GROUP BY location_id
Python Pandas:
# Import the required python packages
import glob
import pandas as pd
import sys, os
# Display the environment python version
print("python versions: ", sys.version)
# Set the directory/folder path that contains only the zipped BDC CSV files (**Required input parameter**)
bdc_dir_path = r"D:\Pennsylvania"
# Set the path to the location Fabric FCC "Active" BSL CSV file (**Required input parameter**)
loc_fab_path = r"D:\LocationFabric_123122\FCC_Active_BSL_12312022_V2.1.csv"
# Set the output directory/folder path where the results will be exported (**Required input parameter**)
results_path = r"D:\results"
Technology codes constants:
# Set technology codes to use in the BDC servicel level analyis
# As a standard, NTIA includes technology codes (10, 40, 50, 71, 72) in the analysis of a location's max service level
tech_codes = [10, 40, 50, 71, 72]
# Get a list of all zipped BDC CSV files in the directory
zipped_csv_files = glob.glob(f"{bdc_dir_path}/*.zip")
# Iterate over the list of zipped CSV files to print each path and a file count
for number, zipped_file in enumerate(zipped_csv_files):
print(number + 1, zipped_file)
# Get the state FIPS code for the BDC files, the FIPS code will be used to query the location fabric
state_fips = zipped_csv_files[0].split('_')[1]
# Create a list of columns to read. By not reading all available columns, memory usage is optimized
use_columns = [
'location_id',
'block_geoid',
'technology',
'max_advertised_download_speed',
'max_advertised_upload_speed',
'low_latency',
]
# Set the BDC columns to treat as integers
int_cols = [
'technology', 'max_advertised_download_speed',
'max_advertised_upload_speed', 'low_latency'
]
# Create a list to store DataFrames
dataframes = []
# Iterate over the list of zipped CSV files
for zipped_file in zipped_csv_files:
# Read the zipped BDC CSV file into a DataFrame, ensure to only read the selected state records
# All columns will be of type "object" (str) which ensures pandas.concat will not incure datatype errors
_df = pd.read_csv(zipped_file,
compression="zip",
dtype=str,
usecols=use_columns
)[lambda df: df['block_geoid'].str.startswith(state_fips.zfill(2))]
# Set columns to treat as integers
_df[int_cols] = _df[int_cols].astype(int)
# Filter by specified technology codes
_df = _df[_df['technology'].isin(tech_codes)]
# Add the DataFrame to the list of DataFrames
if len(_df) > 0:
dataframes.append(_df)
# Concatenate the list of BDC DataFrames into a single BDC DataFrame
bdc_df = pd.concat(dataframes)
# Clear initial file read dataframes to conserve resources/memeory
for _df in dataframes:
del(_df)
del(dataframes)
# Print a count of BDC records for the state
f"Total State BDC records: {len(bdc_df):,}"
# Describe the data types for each column in the BDC dataset (optional)
bdc_df.info()
# Create a list of columns to read. By not reading all columns, memory is optimized
use_columns = [
'location_id',
'block_geoid',
'latitude',
'longitude',
'bsl_flag',
]
# Read the un-zipped Location Fabric CSV file into a DataFrame, ensure to only read the selected state records
state_loc_fab_df = pd.read_csv(
loc_fab_path,
dtype=str,
usecols=use_columns
)[lambda df: df['block_geoid'].str.startswith(state_fips.zfill(2))]
# Filter to ensure only BSL's are included in the analysis
state_loc_fab_df = state_loc_fab_df[state_loc_fab_df['bsl_flag']=='True']
# Print a count of BSL records for the state
f"Total BSL's: {len(state_loc_fab_df):,}"
# Describe the data types for each column in the Location Fabric dataset (optional)
state_loc_fab_df.info()
# Some BSL's may not have a service level reported within the BDC data
# Left join the Location Fabric BSL to the BDC dataset, Nulls will be labled as "Unserved"
df = state_loc_fab_df.merge(bdc_df, on='location_id', how='left', suffixes=('', '_df2_'))
# Remove columns duplicated in the join
dup_cols = [c for c in df.columns if c.endswith('_df2_')]
df = df.drop(columns=dup_cols)
# Clear up some memmory by removing the BDC and location fabric dataframes
del(bdc_df)
del(state_loc_fab_df)
# Print a count of join records for the state
f"Total join records: {len(df):,}"
# Describe the data types for each column in the joined dataset (optional)
df.info()
def calculate_service_level(row):
"""Function calculates the service level based on service level criteria.
The enumerated service level is calculated for a given row in a pandas dataframe/series.
An enumerated service level rank value is returned.
Parameters:
pd.Series:
A Pandas row of data containing the columns:
-'max_advertised_download_speed'
-'max_advertised_upload_speed'
-'low_latency'
Returns:
int:
An enumerated service level calculation results:
-1: 'Service Level Error'
0: 'Unserved'
1: 'Underserved'
2: 'Served'
Example:
>>> df['enum_service_level'] = df.apply(calculate_service_level, axis=1)
"""
#------------------------------------------------------------------------------------------------------
# Check if download or upload speed values are missing
if (
pd.isna(row['max_advertised_download_speed'])
or pd.isna(row['max_advertised_upload_speed'])
or pd.isna(row['low_latency'])
):
return 0 # If speed values are missing, label as (0) "Unserved"
#------------------------------------------------------------------------------------------------------
# Check if the location has low latency
elif row['low_latency'] == 0:
return 0 # If Low latency is False (0), label as (0) "Unserved"
#------------------------------------------------------------------------------------------------------
# Check download and upload speed conditions for "Unserved" category
elif (
row['max_advertised_download_speed'] < 25
or row['max_advertised_upload_speed'] < 3
):
return 0 # If speeds are below 25/3, label as (0) "Unserved"
#------------------------------------------------------------------------------------------------------
# Check download and upload speed conditions for "Underserved" category
elif (
25 <= row['max_advertised_download_speed'] < 100
or 3 <= row['max_advertised_upload_speed'] < 20
):
return 1 # If speeds are at or above 25/3, but less than 100/20, label as (1) "Underserved"
#------------------------------------------------------------------------------------------------------
# Check download and upload speed conditions for "Served" category
elif (
row['max_advertised_download_speed'] >= 100
or row['max_advertised_upload_speed'] >= 20
):
return 2 # If speeds are equal to or above 100/20, label as (2) "Served"
#------------------------------------------------------------------------------------------------------
# with none of the criteria met, label with an (-1) "Service Level Error"
else:
return -1
# Apply the calculate_service_level function to each row in the DataFrame
# This creates a new column called "enum_service_level"
df['enum_service_level'] = df.apply(calculate_service_level, axis=1)
# Group by each BSL to obtain the highest reported service level
# This determines the max service level for each BSL
#
# The additional columns below are retained for additional subquery capablities.
# However, these are not required for the service level analysis and do increase
# the dataframe memory/size:
# - Optional Columns: 'block_geoid', 'latitude', 'longitude', 'bsl_flag'
bsl_max_srvc_lvls_df = df.groupby(['location_id',
'block_geoid',
'latitude',
'longitude',
'bsl_flag']
)['enum_service_level'].max().reset_index()
# Map the service level labels for each BSL
svc_lvl_map = {-1: 'Service Level Error', 0: 'Unserved', 1: 'Underserved', 2: 'Served'}
bsl_max_srvc_lvls_df['service_level'] = bsl_max_srvc_lvls_df['enum_service_level'].map(svc_lvl_map)
# Print a count of max service level records for the state
f"Total max service level records: {len(bsl_max_srvc_lvls_df):,}"
# Show the max service levels per location
# This will show a sample (n) records of the BSL max service levels
# Location IDs along with their corresponding maximum service levels
# Set the number of rows to display
row_qty = 10
# Ensure the specified number of rows can be displayed
if row_qty > pd.get_option('display.max_rows'):
pd.set_option('display.max_rows', row_qty)
# Display the first (n) records
bsl_max_srvc_lvls_df.head(row_qty)
# Export the service level results for every location, while ensuring
# that the files are no larger than 1 mil records so that excel can open
# and display all records.
# Set the base path to include the file name prefix
base_out_path = os.path.join(results_path, f'{state_fips}_service_level_results')
# get the dataframe row count
row_cnt = len(bsl_max_srvc_lvls_df)
# Iterate over a chunks of the dataframe and export the results.
print(f'Exporting results to:')
for i in range(0, row_cnt, 1000000):
# Create a slice of the dataframe for each chunk number
df_chunk = bsl_max_srvc_lvls_df[i:(i + 1000000)]
# Export the slice to a CSV file.
file_path = f'{base_out_path}_rows({i+1}_{i+len(df_chunk)}).csv'
df_chunk.to_csv(file_path, index=False)
print(f'\t- {file_path}')
# Get counts of unique service level values in the service_level column
val_cnts = (bsl_max_srvc_lvls_df['service_level'].value_counts())
# Convert the series to a dataframe
val_cnts_df = pd.DataFrame(val_cnts).reset_index()
# Set the dataframe column names
val_cnts_df.columns= ['service_level','location_qty']
# Show a state level count summary of BSL's per service level
display(val_cnts_df)
# Export the service level summary table to csv
file_path = os.path.join(results_path, f'{state_fips}_service_level_summary.csv')
print(f'Exporting service level summary results to:\n\t- {file_path}')
val_cnts_df.to_csv(file_path, index=False)