Skip to content

Geo Coding

lookup_country(df, latitude_field_name='latitude', longitude_field_name='longitude', fields=None, country_fields=None)

Parameters:

Name Type Description Default
df DataFrame

A spark dataframe

required
latitude_field_name str

The latitude field name in the dataframe

'latitude'
longitude_field_name str

The longitude field name in the dataframe

'longitude'
fields [str]

What fields to return in the dataframe after processing, default is all columns

None
country_fields [str]

Specify what geocoder fields to return, default is all columns ("cc", "name", "admin1", "admin2", "country_name")

None

Returns:

Type Description
DataFrame

DataFrame containing the columns specified in country_fields or by default cc, name, admin1, admin2, country_name

Source code in delta_utils/geocoding.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def lookup_country(
    df: DataFrame,
    latitude_field_name: str = "latitude",
    longitude_field_name: str = "longitude",
    fields: Optional[List[str]] = None,
    country_fields: Optional[List[str]] = None,
) -> DataFrame:
    """
    Append geocoder (country) columns to a dataframe based on its coordinates.

    Args:
        df (DataFrame): A spark dataframe
        latitude_field_name (str): The latitude field name in the dataframe
        longitude_field_name (str): The longitude field name in the dataframe
        fields ([str]): What fields of the input dataframe to keep, default is
            all columns. The selection is applied before the lookup runs.
            NOTE(review): the selection presumably still has to contain the
            latitude/longitude columns - confirm against the lookup helper.
        country_fields ([str]): Specify what geocoder fields to return, default is
            all columns ("cc", "name", "admin1", "admin2", "country_name")

    Returns:
        DataFrame with the kept input columns plus one string column for each
        entry in country_fields (by default cc, name, admin1, admin2, country_name)
    """
    source_df = df if fields is None else df.select(fields)
    geocoder_columns = (
        ["cc", "name", "admin1", "admin2", "country_name"]
        if country_fields is None
        else country_fields
    )

    # Build the output schema from the JSON form of the input schema, then
    # declare every geocoder column as a string column after the input columns.
    output_schema = T.StructType.fromJson(source_df.schema.jsonValue())
    for column_name in geocoder_columns:
        output_schema = output_schema.add(column_name, T.StringType())

    # Bind the column configuration so mapInArrow only has to supply the batches.
    enrich_batches = partial(
        _lookup_country_partial,
        latitude_field_name=latitude_field_name,
        longitude_field_name=longitude_field_name,
        country_fields=geocoder_columns,
    )
    return source_df.mapInArrow(enrich_batches, output_schema)