digraph G {
0 [labelType="html" label="<br><b>AdaptiveSparkPlan</b><br><br>"];
1 [labelType="html" label="<br><b>CollectLimit</b><br><br>"];
subgraph cluster2 {
isCluster="true";
label="WholeStageCodegen (3)\n \nduration: 0 ms";
3 [labelType="html" label="<br><b>Project</b><br><br>"];
}
4 [labelType="html" label="<b>ObjectHashAggregate</b><br><br>number of output rows: 200<br>time in aggregation build: 37 ms<br>spill size: 0.0 B<br>number of sort fallback tasks: 1"];
5 [labelType="html" label="<b>AQEShuffleRead</b><br><br>number of partitions: 1<br>partition data size: 668.6 KiB<br>number of coalesced partitions: 1"];
6 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 31,919<br>local merged chunks fetched: 0<br>shuffle write time total (min, med, max (stageId: taskId))<br>11 ms (2 ms, 2 ms, 2 ms (stage 909.0: task 1248))<br>remote merged bytes read: 0.0 B<br>local merged blocks fetched: 0<br>corrupt merged block chunks: 0<br>remote merged reqs duration: 0 ms<br>remote merged blocks fetched: 0<br>records read: 31,919<br>local bytes read: 637.5 KiB<br>fetch wait time: 0 ms<br>remote bytes read: 0.0 B<br>merged fetch fallback count: 0<br>local blocks read: 5<br>remote merged chunks fetched: 0<br>remote blocks read: 0<br>data size total (min, med, max (stageId: taskId))<br>3.5 MiB (43.3 KiB, 823.7 KiB, 1136.8 KiB (stage 909.0: task 1247))<br>local merged bytes read: 0.0 B<br>number of partitions: 10<br>remote reqs duration: 0 ms<br>remote bytes read to disk: 0.0 B<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>637.5 KiB (9.7 KiB, 132.3 KiB, 211.4 KiB (stage 909.0: task 1247))"];
7 [labelType="html" label="<b>ObjectHashAggregate</b><br><br>number of output rows: 31,919<br>time in aggregation build total (min, med, max (stageId: taskId))<br>2.0 m (786 ms, 19.0 s, 51.0 s (stage 909.0: task 1249))<br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 909.0: task 1251))<br>number of sort fallback tasks: 5"];
subgraph cluster8 {
isCluster="true";
label="WholeStageCodegen (2)\n \nduration: total (min, med, max (stageId: taskId))\n2.0 m (778 ms, 19.0 s, 50.9 s (stage 909.0: task 1249))";
9 [labelType="html" label="<br><b>Project</b><br><br>"];
}
10 [labelType="html" label="<br><b>Project</b><br><br>"];
subgraph cluster11 {
isCluster="true";
label="WholeStageCodegen (1)\n \nduration: total (min, med, max (stageId: taskId))\n2.0 m (813 ms, 19.0 s, 50.9 s (stage 909.0: task 1249))";
12 [labelType="html" label="<b>Generate</b><br><br>number of output rows: 316,675"];
}
13 [labelType="html" label="<br><b>Project</b><br><br>"];
14 [labelType="html" label="<b>Filter</b><br><br>number of output rows: 5"];
15 [labelType="html" label="<b>Scan binaryFile </b><br><br>number of output rows: 5<br>number of files read: 5<br>metadata time: 0 ms<br>size of files read: 34.6 MiB"];
1->0;
3->1;
4->3;
5->4;
6->5;
7->6;
9->7;
10->9;
12->10;
13->12;
14->13;
15->14;
}
16
AdaptiveSparkPlan isFinalPlan=true
CollectLimit 200
Project [COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L, size(COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, true) AS COL_473A1712_AA45_4915_9347_8B9FD4D6C1CF#147906]
WholeStageCodegen (3)
ObjectHashAggregate(keys=[COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843], functions=[collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0), count(1)])
AQEShuffleRead coalesced
Exchange hashpartitioning(COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, 10), ENSURE_REQUIREMENTS, [plan_id=15114]
ObjectHashAggregate(keys=[COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843], functions=[partial_collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0), partial_count(1)])
Project [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, str_if_with_rule((((CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ,)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ,) END OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ;)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ;) END) OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, :)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, :) END) OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, .)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, .) END), str_extract(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, LEFT, dec_to_int(dec_operator(DIFFERENCE, array(cast(str_size(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742) as double), 1.0)))), COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742) AS COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
WholeStageCodegen (2)
Project [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, str_transform_case(str_remove_accent(get_json_object(COL_21E39881_03B7_4478_981D_330734A120C0#147728, $.term)), UPPER) AS COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742]
Generate explode(COL_C3F4AA07_85F2_497E_9135_2A1292466B75#147724), [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723], false, [COL_21E39881_03B7_4478_981D_330734A120C0#147728]
WholeStageCodegen (1)
Project [substring_index(path#147654, /, -1) AS COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, from_json(ArrayType(StringType,false), to_json(bin_content_words(pdf, content#147657), Some(Etc/UTC)), Some(Etc/UTC)) AS COL_C3F4AA07_85F2_497E_9135_2A1292466B75#147724]
Filter ((size(from_json(ArrayType(StringType,false), to_json(bin_content_words(pdf, content#147657), Some(Etc/UTC)), Some(Etc/UTC)), true) > 0) AND isnotnull(from_json(ArrayType(StringType,false), to_json(bin_content_words(pdf, content#147657), Some(Etc/UTC)), Some(Etc/UTC))))
FileScan binaryFile [path#147654,content#147657] Batched: false, DataFilters: [(size(from_json(ArrayType(StringType,false), to_json(bin_content_words(pdf, content#147657), Som..., Format: org.apache.spark.sql.execution.datasources.binaryfile.BinaryFileFormat@42bf79dc, Location: InMemoryFileIndex(5 paths)[file:/data/input/depot/binary/execution/454FFE78_6FB4_4FFA_AB5C_590D50..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<path:string,content:binary>
== Physical Plan ==
AdaptiveSparkPlan (22)
+- == Final Plan ==
CollectLimit (13)
+- * Project (12)
+- ObjectHashAggregate (11)
+- AQEShuffleRead (10)
+- ShuffleQueryStage (9), Statistics(sizeInBytes=3.5 MiB, rowCount=3.19E+4)
+- Exchange (8)
+- ObjectHashAggregate (7)
+- * Project (6)
+- Project (5)
+- * Generate (4)
+- Project (3)
+- Filter (2)
+- Scan binaryFile (1)
+- == Initial Plan ==
CollectLimit (21)
+- Project (20)
+- ObjectHashAggregate (19)
+- Exchange (18)
+- ObjectHashAggregate (17)
+- Project (16)
+- Project (15)
+- Generate (14)
+- Project (3)
+- Filter (2)
+- Scan binaryFile (1)
(1) Scan binaryFile
Output [2]: [path#147654, content#147657]
Batched: false
Location: InMemoryFileIndex [file:/data/input/depot/binary/execution/454FFE78_6FB4_4FFA_AB5C_590D50B78C20/current/BENARD_Clement_these_2021.pdf, ... 4 entries]
ReadSchema: struct<path:string,content:binary>
(2) Filter
Input [2]: [path#147654, content#147657]
Condition : ((size(from_json(ArrayType(StringType,false), to_json(bin_content_words(pdf, content#147657), Some(Etc/UTC)), Some(Etc/UTC)), true) > 0) AND isnotnull(from_json(ArrayType(StringType,false), to_json(bin_content_words(pdf, content#147657), Some(Etc/UTC)), Some(Etc/UTC))))
(3) Project
Output [2]: [substring_index(path#147654, /, -1) AS COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, from_json(ArrayType(StringType,false), to_json(bin_content_words(pdf, content#147657), Some(Etc/UTC)), Some(Etc/UTC)) AS COL_C3F4AA07_85F2_497E_9135_2A1292466B75#147724]
Input [2]: [path#147654, content#147657]
(4) Generate [codegen id : 1]
Input [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, COL_C3F4AA07_85F2_497E_9135_2A1292466B75#147724]
Arguments: explode(COL_C3F4AA07_85F2_497E_9135_2A1292466B75#147724), [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723], false, [COL_21E39881_03B7_4478_981D_330734A120C0#147728]
(5) Project
Output [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, str_transform_case(str_remove_accent(get_json_object(COL_21E39881_03B7_4478_981D_330734A120C0#147728, $.term)), UPPER) AS COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742]
Input [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, COL_21E39881_03B7_4478_981D_330734A120C0#147728]
(6) Project [codegen id : 2]
Output [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, str_if_with_rule((((CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ,)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ,) END OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ;)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ;) END) OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, :)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, :) END) OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, .)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, .) END), str_extract(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, LEFT, dec_to_int(dec_operator(DIFFERENCE, array(cast(str_size(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742) as double), 1.0)))), COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742) AS COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
Input [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742]
(7) ObjectHashAggregate
Input [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
Keys [1]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
Functions [2]: [partial_collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0), partial_count(1)]
Aggregate Attributes [2]: [buf#147919, count#147920L]
Results [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, buf#147921, count#147922L]
(8) Exchange
Input [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, buf#147921, count#147922L]
Arguments: hashpartitioning(COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, 10), ENSURE_REQUIREMENTS, [plan_id=15114]
(9) ShuffleQueryStage
Output [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, buf#147921, count#147922L]
Arguments: 0
(10) AQEShuffleRead
Input [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, buf#147921, count#147922L]
Arguments: coalesced
(11) ObjectHashAggregate
Input [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, buf#147921, count#147922L]
Keys [1]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
Functions [2]: [collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0), count(1)]
Aggregate Attributes [2]: [collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0)#147865, count(1)#147867L]
Results [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843 AS COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0)#147865 AS COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, count(1)#147867L AS COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L]
(12) Project [codegen id : 3]
Output [4]: [COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L, size(COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, true) AS COL_473A1712_AA45_4915_9347_8B9FD4D6C1CF#147906]
Input [3]: [COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L]
(13) CollectLimit
Input [4]: [COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L, COL_473A1712_AA45_4915_9347_8B9FD4D6C1CF#147906]
Arguments: 200
(14) Generate
Input [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, COL_C3F4AA07_85F2_497E_9135_2A1292466B75#147724]
Arguments: explode(COL_C3F4AA07_85F2_497E_9135_2A1292466B75#147724), [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723], false, [COL_21E39881_03B7_4478_981D_330734A120C0#147728]
(15) Project
Output [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, str_transform_case(str_remove_accent(get_json_object(COL_21E39881_03B7_4478_981D_330734A120C0#147728, $.term)), UPPER) AS COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742]
Input [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, COL_21E39881_03B7_4478_981D_330734A120C0#147728]
(16) Project
Output [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, str_if_with_rule((((CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ,)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ,) END OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ;)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, ;) END) OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, :)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, :) END) OR CASE WHEN isnull(EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, .)) THEN false ELSE EndsWith(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, .) END), str_extract(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742, LEFT, dec_to_int(dec_operator(DIFFERENCE, array(cast(str_size(COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742) as double), 1.0)))), COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742) AS COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
Input [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, COL_8024C6FB_6C34_46E0_91CB_695910BF0C1B#147742]
(17) ObjectHashAggregate
Input [2]: [COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
Keys [1]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
Functions [2]: [partial_collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0), partial_count(1)]
Aggregate Attributes [2]: [buf#147919, count#147920L]
Results [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, buf#147921, count#147922L]
(18) Exchange
Input [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, buf#147921, count#147922L]
Arguments: hashpartitioning(COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, 10), ENSURE_REQUIREMENTS, [plan_id=15087]
(19) ObjectHashAggregate
Input [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843, buf#147921, count#147922L]
Keys [1]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843]
Functions [2]: [collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0), count(1)]
Aggregate Attributes [2]: [collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0)#147865, count(1)#147867L]
Results [3]: [COL_BAC7322B_FA9B_430E_B73B_810A78B5874D#147843 AS COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, collect_set(COL_54F54A5F_FDA1_4A0A_BAB8_4D2903F0E87A#147723, 0, 0)#147865 AS COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, count(1)#147867L AS COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L]
(20) Project
Output [4]: [COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L, size(COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, true) AS COL_473A1712_AA45_4915_9347_8B9FD4D6C1CF#147906]
Input [3]: [COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L]
(21) CollectLimit
Input [4]: [COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L, COL_473A1712_AA45_4915_9347_8B9FD4D6C1CF#147906]
Arguments: 200
(22) AdaptiveSparkPlan
Output [4]: [COL_0A61C9DE_5CE3_4DA0_AE5A_F3BF3EC63939#147884, COL_863B601E_B157_49CF_87BA_7642EED15F21#147885, COL_893E73BD_760D_4B98_A476_91E3F0459760#147886L, COL_473A1712_AA45_4915_9347_8B9FD4D6C1CF#147906]
Arguments: isFinalPlan=true