apache spark – pyspark でデータフレームを選択中に null を取得する

プログラミングQA


私はspark 3.1を使用していて、JSONファイルを読み込もうとしています

以下のファイルのスキーマを次のように定義しました。

                    StructType([
                            StructField('search_metadata', MapType(StringType(),StringType())),
                            StructField('search_parameters', MapType(StringType(),StringType())),
                            StructField('search_information', MapType(StringType(),StringType())),
                            StructField('local_results',StructType([
                                                        StructField('position', StringType(), True),
                                                        StructField('title', StringType(), True),
                                                        StructField('place_id', StringType(), True),
                                                        StructField('data_id', StringType(), True),
                                                        StructField('data_cid', StringType(), True),
                                                        StructField('reviews_link', StringType(), True),
                                                        StructField('photos_link', StringType(), True),
                                                        StructField('gps_coordinates', MapType(StringType(),StringType()), True),
                                                        StructField('place_id_search', StringType(), True),
                                                        StructField('unclaimed_listing', BinaryType(), True),
                                                        StructField('type', StringType(), True),
                                                        StructField('address', StringType(), True),
                                                        StructField('open_state', StringType(), True),
                                                        StructField('hours', StringType(), True),
                                                        StructField('phone', MapType(StringType(),StringType()), True),
                                                        StructField('thumbnail', StringType(), True),
                                                            ]), True),
                            StructField('serpapi_pagination',MapType(StringType(),StringType())),
                            StructField('search_query', StringType(), True),
                            ]) 

私のJSONファイルは次のとおりです。

[{
    "search_metadata": {
        "id": "63560cab66440a949ade5d72",
        "status": "Success",
        "json_endpoint": "https://serpapi.com/searches/b6986ff9ff715b13/63560cab66440a949ade5d72.json",
        "created_at": "2022-10-24 03:55:23 UTC",
        "processed_at": "2022-10-24 03:55:23 UTC",
        "google_maps_url": "https://www.google.com/maps/search/WH?hl=en",
        "raw_html_file": "https://serpapi.com/searches/b6986ff9ff715b13/63560cab66440a949ade5d72.html",
        "total_time_taken": 1.91
    },
    "search_parameters": {
        "engine": "google_maps",
        "type": "search",
        "q": "WH",
        "google_domain": "google.com",
        "hl": "en"
    },
    "search_information": {
        "local_results_state": "Results for exact spelling",
        "query_displayed": "WH"
    },
    "local_results": [{
            "position": 1,
            "title": "WH International Casting, LLC",
            "place_id": "ChIJh0wvXcu_a4gRWuH-O1ltlPg",
            "data_id": "0x886bbfcb5d2f4c87:0xf8946d593bfee15a",
            "data_cid": "17912061847985381722",
            "reviews_link": "https://serpapi.com/search.json?data_id=0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a&engine=google_maps_reviews&hl=en",
            "photos_link": "https://serpapi.com/search.json?data_id=0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a&engine=google_maps_photos&hl=en",
            "gps_coordinates": {
                "latitude": 38.295865,
                "longitude": -85.73001099999999
            },
            "place_id_search": "https://serpapi.com/search.json?data=%214m5%213m4%211s0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a%218m2%213d38.295865%214d-85.73001099999999&engine=google_maps&google_domain=google.com&hl=en&type=place",
            "unclaimed_listing": true,
            "type": "Warehouse",
            "address": "260 America Pl Dr, Jeffersonville, IN 47130",
            "open_state": "Closed ⋅ Opens 8AM Mon",
            "hours": "Closed ⋅ Opens 8AM Mon",
            "operating_hours": {
                "sunday": "Closed",
                "monday": "8AM–4:30PM",
                "tuesday": "8AM–4:30PM",
                "wednesday": "8AM–4:30PM",
                "thursday": "8AM–4:30PM",
                "friday": "8AM–4:30PM",
                "saturday": "Closed"
            },
            "phone": "(812) 725-8029",
            "thumbnail": "https://lh5.googleusercontent.com/p/AF1QipPWDyyzxp1MG27vv3WVZbzy5WVI-Qh2u2jEDb-C=w122-h92-k-no"
        },
        {
            "position": 2,
            "title": "W.H. Smith Manor",
            "place_id": "ChIJ9584e22DXIgR5w2f2saKBOU",
            "data_id": "0x885c836d7b389ff7:0xe5048ac6da9f0de7",
            "data_cid": "16502467521268354535",
            "reviews_link": "https://serpapi.com/search.json?data_id=0x885c836d7b389ff7%3A0xe5048ac6da9f0de7&engine=google_maps_reviews&hl=en",
            "photos_link": "https://serpapi.com/search.json?data_id=0x885c836d7b389ff7%3A0xe5048ac6da9f0de7&engine=google_maps_photos&hl=en",
            "gps_coordinates": {
                "latitude": 36.581589799999996,
                "longitude": -83.6581731
            },
            "place_id_search": "https://serpapi.com/search.json?data=%214m5%213m4%211s0x885c836d7b389ff7%3A0xe5048ac6da9f0de7%218m2%213d36.581589799999996%214d-83.6581731&engine=google_maps&google_domain=google.com&hl=en&type=place",
            "unclaimed_listing": true,
            "type": "University department",
            "address": "184 Robertson Ave, Harrogate, TN 37752",
            "open_state": "Closed ⋅ Opens 8AM Mon",
            "hours": "Closed ⋅ Opens 8AM Mon",
            "operating_hours": {
                "sunday": "Closed",
                "monday": "8AM–4:30PM",
                "tuesday": "8AM–4:30PM",
                "wednesday": "8AM–4:30PM",
                "thursday": "8AM–4:30PM",
                "friday": "8AM–4:30PM",
                "saturday": "Closed"
            },
            "phone": "(423) 869-3611",
            "website": "http://lmunet.edu/",
            "thumbnail": "https://streetviewpixels-pa.googleapis.com/v1/thumbnail?panoid=mJwpOER-2yIbmD3xSwQ2pQ&cb_client=search.gws-prod.gps&w=80&h=92&yaw=307.97266&pitch=0&thumbfov=100"
        }

    ],
    "serpapi_pagination": {
        "next": "https://serpapi.com/search.json?engine=google_maps&google_domain=google.com&hl=en&q=WH&start=20&type=search"
    },
    "search_query": "WH.json"
}]

以下のようにいくつかの行を選択しようとしていますが、null 値を取得しています。スキーマ定義に問題があると想定しています。誰か見てください

 df = df.select(col('local_results'),col('local_results.position').alias('position'), 
                       col('local_results.title').alias('title'))
            df.show()

誰かがスキーマを修正して結果を表示するのを手伝ってくれますか?



Source link

コメント

タイトルとURLをコピーしました