"""
Updating table and column metadata.

Originally, this was done by writing into RDs, and the bulk of the code
still reflects that.

The problem here is that RDs typically are formatted with lots of love,
also within elements -- e.g., like this::

	<column name="bla" type="text"
		ucd="foo.bar"
		description="A long text carefully
			broken at the right place"
	/>

There's no way one can coax a normal XML parser into giving events that'd
allow us to preserve this formatting.   Hence, when manipulating
RD sources, I need something less sophisticated -- the dumb XML parser
implemented here.

Except possibly for coverage (and even there I have my doubts) all this
has turned out to be a bad idea, best shown by the endless trouble it is
with STREAMs.

We therefore store column metadata in dc.simple_col_stats (and perhaps
others in the future) starting in DaCHS 2.3.1 (schema version 27).
"""

#c Copyright 2008-2021, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.


from gavo import base
from gavo import rsc
from gavo.user import info


def iterCoverageItems(updater):
	"""yields (axisName, value) pairs with coverage computed from the database.

	axisName is one of "spatial", "temporal", or "spectral"; an axis is
	only considered when the updater's parent has a non-None attribute of
	that name.

	For each axis, the table inspected is the axis-specific one
	(updater.spaceTable, updater.timeTable, updater.spectralTable),
	falling back to updater.sourceTable.  When neither is set, the axis is
	skipped.

	The spatial value is the ASCII serialisation of the MOC returned by
	info.getMOCForStdTable (skipped when that MOC is false); the temporal
	and spectral values are flat lists built from the pairs produced by
	info.iterScalarLimits.

	Nothing is yielded when updater is base.NotGiven.

	NOTE: so far, we can only have one coverage item, so it is enough to
	just name the axis.  If and when there is more than one coverage item,
	the pairs will need some identifier for the coverage element, too.
	"""
	if updater is base.NotGiven:
		return

	coverage = updater.parent

	if coverage.spatial is not None:
		spaceSource = updater.spaceTable or updater.sourceTable
		if spaceSource:
			moc = info.getMOCForStdTable(spaceSource, updater.mocOrder)
			if moc:
				yield "spatial", moc.asASCII()

	if coverage.temporal is not None:
		timeSource = updater.timeTable or updater.sourceTable
		if timeSource:
			# flatten the sequence of limit pairs into a single list
			yield "temporal", [
				limit
				for pair in info.iterScalarLimits(
					timeSource, info.getTimeLimitsExprs)
				for limit in pair]

	if coverage.spectral is not None:
		spectralSource = updater.spectralTable or updater.sourceTable
		if spectralSource:
			yield "spectral", [
				limit
				for pair in info.iterScalarLimits(
					spectralSource, info.getSpectralLimitsExprs)
				for limit in pair]


def updateRDLevelMetadata(rd, conn):
	"""Determines RD-level metadata (coverage, mainly) and writes it to
	the row for rd in dc.rdmeta.

	The spatial, temporal, and spectral columns are first reset to NULL.
	Then, if rd has a coverage element, each item coming out of
	iterCoverageItems for its updater is written into the column
	of the same name.

	All statements are executed through conn; nothing is committed here.
	"""
	rdId = rd.sourceId

	# Clear everything first so axes that are not re-computed below
	# end up NULL rather than retaining whatever was there before.
	conn.execute(
		"UPDATE dc.rdmeta"
		" SET spatial=NULL, temporal=NULL, spectral=NULL"
		" WHERE sourceRD=%(rdId)s",
		{"rdId": rdId})

	if not rd.coverage:
		return

	for axisName, axisValue in iterCoverageItems(rd.coverage.updater):
		# axisName is used as a column name here, so it is interpolated
		# into the statement rather than passed as a parameter.
		conn.execute(
			f"UPDATE dc.rdmeta SET {axisName}=%(value)s"
			" WHERE sourceRD=%(rdId)s",
			{"rdId": rdId, "value": axisValue})


def updateTableLevelMetadata(
		td,
		conn,
		samplePercent=None,
		acquireColumnMeta=True):
	"""determines column metadata for the table td and inserts it into
	dc.*stats.

	If td does not exist in the database (as seen through conn), a
	warning is emitted and nothing else happens.

	Otherwise, td is annotated through info.annotateDBTable.  Rows for
	dc.simple_col_stats are then built from the annotations of td's
	numeric columns (those with a type in base.NUMERIC_TYPES); columns
	without annotations, or with empty ones, contribute nothing.  Any
	rows already in dc.simple_col_stats for td are deleted before the new
	ones are imported.  Finally, nrows in dc.tablemeta is set from
	td.nrows (presumably filled in by annotateDBTable).

	samplePercent, if given, says how much of the table to look at; giving
	this on views will fail.

	If acquireColumnMeta is False, only the size of the table is estimated.
	"""
	qName = td.getQName()

	if base.UnmanagedQuerier(conn).getTableType(qName) is None:
		base.ui.notifyWarning("Skipping non-existing table %s"%qName)
		return

	info.annotateDBTable(td, samplePercent, acquireColumnMeta)

	statRows = []
	for column in td:
		if (column.type in base.NUMERIC_TYPES
				and hasattr(column, "annotations")):
			# work on a copy so the column's own annotations stay untouched
			stats = column.annotations.copy()
			if stats:
				stats["tableName"] = qName
				stats["column_name"] = column.name
				statRows.append(stats)

	# replace, rather than add to, whatever stats were there before
	conn.execute(
		"DELETE FROM dc.simple_col_stats WHERE tableName=%(tableName)s",
		{"tableName": qName})
	rsc.makeData(
		base.resolveCrossId("//dc_tables#import_simple_col_stats"),
		forceSource=statRows,
		connection=conn)

	conn.execute(
		"UPDATE dc.tablemeta SET nrows=%(nrows)s"
		" WHERE tableName=%(tableName)s",
		{"tableName": qName, "nrows": td.nrows})


def updateForRD(rd, conn, samplePercent=None, acquireColumnMeta=True):
	"""obtains RD- and table-level metadata for rd and writes it to
	the meta data tables through conn.

	Table-level metadata is only obtained for tables that are onDisk.
	Of these, views (tables with a viewStatement) are skipped unless they
	have a true forceStats property.

	samplePercent and acquireColumnMeta are passed on to
	updateTableLevelMetadata unchanged.
	"""
	base.ui.notifyInfo(f"Obtaining metadata for rd {rd.sourceId}...")
	updateRDLevelMetadata(rd, conn)

	for tableDef in rd.tables:
		if not tableDef.onDisk:
			continue

		isUnforcedView = (tableDef.viewStatement
			and not tableDef.getProperty("forceStats", False))
		if isUnforcedView:
			continue

		updateTableLevelMetadata(
			tableDef, conn, samplePercent, acquireColumnMeta)


def _getUpdatableRdIds():
	"""returns a list of the ids of RDs that already have limits-obtained
	metadata.

	These are the RDs with a non-NULL spatial, temporal, or spectral
	column in dc.rdmeta, together with the RDs for which dc.tablemeta has
	a non-NULL nrows.  Presumably, dachs limits was run on them
	before.
	"""
	# The statement text, embedded whitespace included, is kept verbatim.
	candidatesQuery = (
		"SELECT sourceRD FROM"
		"	dc.rdmeta"
		"		WHERE spatial IS NOT NULL"
		"			OR temporal IS NOT NULL OR spectral IS NOT NULL"
		" UNION"
		"	SELECT sourceRD FROM"
		"		dc.tablemeta"
		"	WHERE nrows IS NOT NULL")

	with base.getTableConn() as conn:
		# materialise the result while the connection is still open
		rdIds = [row[0] for row in conn.query(candidatesQuery)]
	return rdIds


def parseCmdLine():
	"""parses the process' command line and returns the resulting
	argparse namespace.

	The namespace has the attributes

	* tablesOnly -- True if -t/--tables-only was given, False otherwise
	* samplePercent -- a float if -s/--sample-percent was given, else None
	* itemId -- a list of one or more references as given on the command
	  line
	"""
	import argparse

	argParser = argparse.ArgumentParser(
		description="Updates existing values min/max items in a referenced"
			" table or RD.")

	argParser.add_argument(
		"-t", "--tables-only",
		action="store_true",
		dest="tablesOnly",
		help="Only acquire table/resource-level metadata (rather than column"
			" metadata, which usually takes a lot longer).")

	argParser.add_argument(
		"-s", "--sample-percent",
		dest="samplePercent",
		metavar="P",
		type=float,
		default=None,
		help="Only look at P percent of the table to determine min/max/mean.")

	argParser.add_argument(
		"itemId",
		nargs="+",
		help="Cross-RD reference of a table or"
			" RD to update, as in ds/q or ds/q#mytable; only RDs in inputsDir"
			" can be updated.  A single ALL will expand to all RDs that already"
			" have limits-obtained metadata.")

	return argParser.parse_args()


def main():
	"""runs the metadata update for the items named on the command line.

	Each reference is resolved through api.getReferencedElement.  Table
	definitions get their table-level metadata updated, RDs get the full
	treatment of updateForRD.  The transaction on the writable admin
	connection is committed after each item.

	A single ALL on the command line is replaced by the ids of all RDs
	returned by _getUpdatableRdIds.

	A base.ReportableError is raised for a reference that resolves
	to neither an RD nor a table definition.
	"""
	from gavo import api
	args = parseCmdLine()

	if args.itemId==["ALL"]:
		args.itemId = _getUpdatableRdIds()

	# -t means "skip the column metadata"
	acquireColumnMeta = not args.tablesOnly

	with api.getWritableAdminConn() as conn:
		for itemRef in args.itemId:
			item = api.getReferencedElement(itemRef)

			if isinstance(item, api.TableDef):
				updateTableLevelMetadata(item, conn,
					args.samplePercent, acquireColumnMeta)

			elif isinstance(item, api.RD):
				updateForRD(item, conn, args.samplePercent, acquireColumnMeta)

			else:
				# report the offending reference only, not the whole list
				# of references from the command line
				raise base.ReportableError(
					f"{itemRef} references neither an RD nor a table definition")

			conn.commit()
