PostgreSQL versus Greenplum
→ Server configuration
- datasets of 30M, 60M, and 100M rows
- 16 GB RAM
- 500 GB SSD
- 8 cores
→ PostgreSQL
- version 13.1
- MADlib 1.17
- configured with pgtune:
max_connections = 20
shared_buffers = 4GB
effective_cache_size = 12GB
maintenance_work_mem = 2GB
checkpoint_completion_target = 0.9
wal_buffers = 16MB
default_statistics_target = 500
random_page_cost = 1.1
effective_io_concurrency = 200
work_mem = 26214kB
min_wal_size = 4GB
max_wal_size = 16GB
max_worker_processes = 8
max_parallel_workers_per_gather = 4
max_parallel_workers = 8
max_parallel_maintenance_workers = 4
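These values normally go into postgresql.conf. As a sketch, a couple of them can also be set at runtime (requires superuser):

alter system set work_mem = '26214kB';
alter system set shared_buffers = '4GB';
select pg_reload_conf();
-- note: shared_buffers only takes effect after a server restart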
→ Greenplum
- version 6.12
- MADlib 1.17
- 4 segments
- default configuration
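For reference, a single-host 4-segment cluster like this one is typically created with gpinitsystem. A minimal sketch of the config file (paths are hypothetical, not taken from the original setup):

ARRAY_NAME="warehouse"
SEG_PREFIX=gpseg
PORT_BASE=6000
declare -a DATA_DIRECTORY=(/data/primary /data/primary /data/primary /data/primary)
MASTER_HOSTNAME=gnubuntu
MASTER_DIRECTORY=/data/master
MASTER_PORT=5432

It is then passed to gpinitsystem -c gpinitsystem_config -h hostfile, where hostfile lists the single host.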
→ Overall results
→ Conclusion
From these preliminary results, Greenplum comes out ahead of PostgreSQL even on a standalone server, which is quite surprising since it is designed to run on a distributed cluster.
Greenplum is much faster than PostgreSQL for data warehousing and large-table management across every kind of operation: insert, update, delete, and select. It also ships with a set of relevant tools such as GPText, MADlib, MapReduce, PL/Python, PL/R, and PXF, and it provides resource queues to let multiple users share the cluster more smoothly.
While PostgreSQL is effective on a 30M-row table, it struggles to handle larger ones.
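For instance, the resource queues mentioned above are ordinary SQL objects. A minimal sketch (queue and role names are hypothetical):

create resource queue reporting_queue with (active_statements=5, priority=low);
alter role analyst resource queue reporting_queue;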
→ SQL
→ PostgreSQL
create table labevents(
row_id integer
, subject_id integer
, hadm_id integer
, itemid integer
, charttime timestamp(0) without time zone
, value character varying(200)
, valuenum double precision
, valueuom character varying(20)
, flag character varying(20)
, mimic_id integer
) ;
CREATE TABLE
warehouse=# \copy labevents from 'labevents.csv' csv ;
COPY 27854055
Time: 353750.640 ms (05:53.751)
warehouse=# select count(*) c, c1.flag from labevents as c1 group by c1.flag order by c desc limit 10 ;
c | flag
----------+----------
18009889 |
9779594 | abnormal
64572 | delta
(3 rows)
Time: 939.223 ms
warehouse=# select c1.row_id, row_number() over(partition by c1.itemid order by c1.row_id desc) rk from labevents as c1 limit 10;
row_id | rk
----------+----
27906665 | 1
27906621 | 2
27906612 | 3
27906561 | 4
27906550 | 5
27906507 | 6
27906370 | 7
27906325 | 8
27906270 | 9
27906257 | 10
(10 rows)
Time: 13714.732 ms (00:13.715)
warehouse=# insert into labevents select * from labevents;
INSERT 0 27854055
Time: 423631.155 ms (07:03.631)
warehouse=# select count(*) c, c1.flag from labevents as c1 group by c1.flag order by c desc limit 10 ;
c | flag
----------+----------
36019778 |
19559188 | abnormal
129144 | delta
(3 rows)
Time: 5295.608 ms (00:05.296)
warehouse=# select c1.row_id, row_number() over(partition by c1.itemid order by c1.row_id desc) rk from labevents as c1 limit 10;
row_id | rk
----------+----
27906665 | 1
27906665 | 2
27906621 | 3
27906621 | 4
27906612 | 5
27906612 | 6
27906561 | 7
27906561 | 8
27906550 | 9
27906550 | 10
(10 rows)
Time: 126347.700 ms (02:06.348)
warehouse=# select count(*) c, c1.flag from labevents as c1 join labevents c2 on c1.row_id = c2.row_id group by c1.flag order by c desc limit 10 ;
c | flag
----------+----------
72039556 |
39118376 | abnormal
258288 | delta
(3 rows)
Time: 234570.663 ms (03:54.571)
warehouse=# insert into labevents select * from labevents;
INSERT 0 55708110
Time: 1201961.089 ms (20:01.961)
warehouse=# select count(*) c, c1.flag from labevents as c1 group by c1.flag order by c desc limit 10 ;
c | flag
----------+----------
72039556 |
39118376 | abnormal
258288 | delta
(3 rows)
Time: 7277.858 ms (00:07.278)
warehouse=# select c1.row_id, row_number() over(partition by c1.itemid order by c1.row_id desc) rk from labevents as c1 limit 10;
row_id | rk
----------+----
27906665 | 1
27906665 | 2
27906665 | 3
27906665 | 4
27906621 | 5
27906621 | 6
27906621 | 7
27906621 | 8
27906612 | 9
27906612 | 10
(10 rows)
Time: 465615.775 ms (07:45.616)
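The blow-up from ~14 s on 28M rows to ~466 s on 111M rows is likely dominated by sorting the whole table for the window function, since the limit cannot be pushed below it. On a similar setup this could be confirmed by inspecting the plan:

explain (analyze, buffers)
select c1.row_id, row_number() over(partition by c1.itemid order by c1.row_id desc) rk
from labevents as c1 limit 10;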
warehouse=# select count(*) c, c1.flag from labevents as c1 join labevents c2 on c1.row_id = c2.row_id group by c1.flag order by c desc limit 10 ;
c | flag
-----------+----------
288158224 |
156473504 | abnormal
1033152 | delta
(3 rows)
Time: 494615.953 ms (08:14.616)
warehouse=# delete from labevents where row_id < 5000000;
DELETE 19952316
Time: 157689.975 ms (02:37.690)
warehouse=# update labevents set flag = 'new flag' where row_id < 10000000;
UPDATE 19949040
Time: 810058.857 ms (13:30.059)
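Note that in PostgreSQL, deletes and updates of this size leave dead tuples behind, so the real cost includes the vacuum needed afterwards to reclaim the space, e.g.:

vacuum (verbose, analyze) labevents;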
create unlogged table tbl_range (data1 text, data2 text, randomnumber double precision);
insert into tbl_range
( select
('{a,b,c,d,e,f}'::text[])[1 + floor(random() * 6)] as data1
,('{g,h,i,j,k,l}'::text[])[1 + floor(random() * 6)] as data2
,random() * 1000 as randomnumber
from generate_series(1,400000000));
SELECT madlib.pivot('tbl_range', 'pivout', 'data1', 'data2', 'randomnumber', 'sum,max');
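For reference, madlib.pivot materializes its result into the output table given as the second argument (pivout here). A plain-SQL sketch of what it computes for a single data2 value (output column names are illustrative):

select data1
, sum(randomnumber) filter (where data2 = 'g') as randomnumber_sum_g
, max(randomnumber) filter (where data2 = 'g') as randomnumber_max_g
from tbl_range
group by data1;

madlib.pivot generates one such sum/max column pair per distinct data2 value.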
→ Greenplum
warehouse=# create table labevents(
row_id integer
,subject_id integer
,hadm_id integer
,itemid integer
,charttime timestamp(0) without time zone
,value character varying(200)
,valuenum double precision
,valueuom character varying(20)
,flag character varying(20)
,mimic_id integer
) with (appendonly=true, orientation=column, compresstype=ZSTD );
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'row_id' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE
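The notice can be avoided, and skew controlled, by picking the distribution key explicitly. A sketch reusing the same definition (labevents_dist is a hypothetical name):

create table labevents_dist (like labevents)
with (appendonly=true, orientation=column, compresstype=ZSTD)
distributed by (row_id);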
warehouse=# \copy labevents from '/home/nparis/labevents.csv' csv ;
COPY 27854055
Time: 80878.637 ms
warehouse=# select count(*) c, c1.flag from labevents as c1 group by c1.flag order by c desc limit 10 ;
c | flag
----------+----------
18009889 |
9779594 | abnormal
64572 | delta
(3 rows)
Time: 1164.303 ms
warehouse=# select c1.row_id, row_number() over(partition by c1.itemid order by c1.row_id desc) rk from labevents as c1 limit 10;
row_id | rk
----------+----
27907576 | 1
27907382 | 2
27901097 | 3
27901054 | 4
27900783 | 5
27897288 | 6
27896669 | 7
27894567 | 8
27892281 | 9
27884982 | 10
(10 rows)
Time: 12569.391 ms
warehouse=# insert into labevents select * from labevents;
INSERT 0 27854055
Time: 79361.173 ms
warehouse=# select count(*) c, c1.flag from labevents as c1 group by c1.flag order by c desc limit 10 ;
c | flag
----------+----------
36019778 |
19559188 | abnormal
129144 | delta
(3 rows)
Time: 2353.157 ms
warehouse=# select c1.row_id, row_number() over(partition by c1.itemid order by c1.row_id desc) rk from labevents as c1 limit 10;
row_id | rk
----------+----
27906665 | 1
27906665 | 2
27906621 | 3
27906621 | 4
27906612 | 5
27906612 | 6
27906561 | 7
27906561 | 8
27906550 | 9
27906550 | 10
(10 rows)
Time: 25409.642 ms
warehouse=# select count(*) c, c1.flag from labevents as c1 join labevents c2 on c1.row_id = c2.row_id group by c1.flag order by c desc limit 10 ;
c | flag
----------+----------
72039556 |
39118376 | abnormal
258288 | delta
(3 rows)
Time: 101654.716 ms
warehouse=# insert into labevents select * from labevents;
INSERT 0 55708110
Time: 112203.521 ms
warehouse=# select count(*) c, c1.flag from labevents as c1 group by c1.flag order by c desc limit 10 ;
c | flag
----------+----------
72039556 |
39118376 | abnormal
258288 | delta
(3 rows)
Time: 3941.551 ms
warehouse=# select c1.row_id, row_number() over(partition by c1.itemid order by c1.row_id desc) rk from labevents as c1 limit 10;
row_id | rk
----------+----
27906665 | 1
27906665 | 2
27906665 | 3
27906665 | 4
27906621 | 5
27906621 | 6
27906621 | 7
27906621 | 8
27906612 | 9
27906612 | 10
(10 rows)
Time: 65279.232 ms
warehouse=# select count(*) c, c1.flag from labevents as c1 join labevents c2 on c1.row_id = c2.row_id group by c1.flag order by c desc limit 10 ;
c | flag
-----------+----------
288158224 |
156473504 | abnormal
1033152 | delta
(3 rows)
Time: 188816.067 ms
warehouse=# delete from labevents where row_id < 5000000;
DELETE 19952316
Time: 8930.371 ms
warehouse=# update labevents set flag = 'new flag' where row_id < 10000000;
UPDATE 19949040
Time: 32977.965 ms
nparis@gnubuntu:~/bin$ /opt/greenplum-db-6-6.12.1/bin/gpload -f gpload.yaml
2020-12-04 18:16:38|INFO|gpload session started 2020-12-04 18:16:38
2020-12-04 18:16:38|INFO|started gpfdist -p 8081 -P 8082 -f "/home/nparis/labevents.csv" -t 30
2020-12-04 18:17:49|INFO|running time: 71.18 seconds
2020-12-04 18:17:49|INFO|rows Inserted = 27854055
2020-12-04 18:17:49|INFO|rows Updated = 0
2020-12-04 18:17:49|INFO|data formatting errors = 0
2020-12-04 18:17:49|INFO|gpload succeeded
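The control file passed with -f is not reproduced here; a minimal gpload.yaml matching the log above might look like the following (connection values are hypothetical):

VERSION: 1.0.0.1
DATABASE: warehouse
USER: nparis
HOST: localhost
PORT: 5432
GPLOAD:
   INPUT:
    - SOURCE:
         PORT_RANGE: [8081,8082]
         FILE:
           - /home/nparis/labevents.csv
    - FORMAT: csv
   OUTPUT:
    - TABLE: labevents
    - MODE: insert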
warehouse=# create table tbl_range (data1 text, data2 text, randomnumber double precision) with (appendonly=true, orientation=column, compresstype=ZLIB );
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'data1' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE
Time: 352.343 ms
warehouse=# insert into tbl_range
( select
('{a,b,c,d,e,f}'::text[])[1 + floor(random() * 6)] as data1
,('{g,h,i,j,k,l}'::text[])[1 + floor(random() * 6)] as data2
,random() * 1000 as randomnumber
from generate_series(1,100000000)) ;
INSERT 0 100000000
Time: 145408.190 ms
warehouse=# SELECT madlib.pivot('tbl_range', 'pivout', 'data1', 'data2', 'randomnumber', 'sum,max');
pivot
-------
(1 row)
Time: 67074.359 ms
create table tbl_range (data1 text, data2 text, randomnumber double precision) with (appendonly=true, orientation=column, compresstype=ZLIB );
insert into tbl_range
( select
('{a,b,c,d,e,f}'::text[])[1 + floor(random() * 6)] as data1
,('{g,h,i,j,k,l}'::text[])[1 + floor(random() * 6)] as data2
,random() * 1000 as randomnumber
from generate_series(1,400000000));
SELECT madlib.pivot('tbl_range', 'pivout', 'data1', 'data2', 'randomnumber', 'sum,max');