muMLE/pattern_matching/planGraph.py

# coding: utf-8

"""
Author:		Sten Vercamman
			University of Antwerp

Example code for paper: Efficient model transformations for novices
url: http://msdl.cs.mcgill.ca/people/hv/teaching/MSBDesign/projects/Sten.Vercammen

The main goal of this code is to give an overview, and an understandable
implementation, of known techniques for pattern matching and solving the
sub-graph homomorphism problem. The presented techniques do not include
performance adaptations/optimizations. It is not optimized to be efficient
but rather for the ease of understanding the workings of the algorithms.
The paper does list some possible extensions/optimizations.

It is intended as a guideline, even for novices, and provides an in-depth look
at the workings behind various techniques for efficient pattern matching.
"""

from searchGraph import *
from enum import *

# Enum of all primitive operation types, used as labels on the PlanGraph edges.
# note: 'inc' stands for the primitive operation "in"
# ("in" itself is a reserved keyword in Python, so it can not be used as a name)
PRIM_OP	= Enum(['lkp', 'inc', 'out', 'src', 'tgt'])

class PlanGraph(object):
	"""
	Holds the PlanGraph for a pattern.
	Can create the search plan of the pattern for a given SearchGraph.
	"""
	def __init__(self, pattern):
		"""
		Builds the PlanGraph for the given pattern.

		Every vertex and every edge of the pattern is represented by a vertex
		in the PlanGraph (is_vertex tells which of both it was). The edges of
		the PlanGraph are labelled with a PRIM_OP and connect the elements:
		- tgt/src: from a pattern edge to its target/source vertex (cost 0.0)
		- inc/out: the reverted edges of tgt/src (no cost assigned here)
		- lkp: from the root to every element (no cost assigned here)

		pattern: the pattern, has to be a Graph.
		Raises a TypeError if pattern is not a Graph.
		"""
		if not isinstance(pattern, Graph):
			raise TypeError('PlanGraph expects the pattern to be a Graph')
		# member variables:
		self.vertices = []	# will not be searched in
		self.edges = []	# will not be searched in

		# representation map, maps vertex from pattern to element from PlanGraph
		# (no need for edges)
		repr_map = {}

		def addPlanEdge(src, tgt, label, cost=None):
			"""
			Creates the PlanGraph edge src --label--> tgt, links it to both
			of its vertices and saves it in self.edges.
			The cost (and its backup) is only set when a cost is given.
			"""
			plan_edge = Edge(src, tgt)
			# backup src and tgt (Edmonds might override it)
			plan_edge.orig_src = plan_edge.src
			plan_edge.orig_tgt = plan_edge.tgt
			plan_edge.label = label
			# link vertices connected to this plan_edge
			plan_edge.src.addOutgoingEdge(plan_edge)
			plan_edge.tgt.addIncomingEdge(plan_edge)
			if cost is not None:
				plan_edge.cost = cost
				# backup orig cost, as Edmonds changes cost
				plan_edge.orig_cost = plan_edge.cost
			# save created edge
			self.edges.append(plan_edge)

		# 1.1: for every vertex in the pattern graph,
		# create a vertex representing the pattern element
		for str_type, vertices in pattern.vertices.items():
			for vertex in vertices:
				# we only need to know the type of the vertex
				plan_vertex = Vertex(str_type)
				# and we need to know that it was a vertex
				plan_vertex.is_vertex = True
				# for re-linking the edges, we'll need to map the
				# vertex of the pattern to the plan_vertex
				repr_map[vertex] = plan_vertex
				# save created plan_vertex
				self.vertices.append(plan_vertex)

		# 1.2: for every edge in the pattern graph,
		# create a vertex representing the pattern element
		for _, edges in pattern.edges.items():
			for edge in edges:
				# we only need to know the type of the edge
				plan_vertex = Vertex(edge.type)
				# and we need to know that it was an edge
				plan_vertex.is_vertex = False
				# save created plan_vertex
				self.vertices.append(plan_vertex)

				# 4: for every element x from the PlanGraph
				# that represents an edge e in the pattern:
				# tgt and src cost are always 1, we use logarithmic cost,
				# (=> cost = ln(1) = 0.0) so that we do not need to minimize
				# a product, but can minimize a sum
				# (as ln(c1...ck) = ln(c1) + ... + ln (ck))

				# 4.1: create an edge labelled tgt from x to the vertex in the
				# PlanGraph representing the target vertex of e in the pattern
				# graph, and a reverted edge labelled in
				tgt_vertex = repr_map[edge.tgt]
				addPlanEdge(plan_vertex, tgt_vertex, PRIM_OP.tgt, 0.0)	# 4.1.1: tgt
				addPlanEdge(tgt_vertex, plan_vertex, PRIM_OP.inc)		# 4.1.2: in

				# 4.2: create an edge labelled src from x to the vertex in the
				# PlanGraph representing the source vertex of e in the pattern
				# graph, and a reverted edge labelled out
				src_vertex = repr_map[edge.src]
				addPlanEdge(plan_vertex, src_vertex, PRIM_OP.src, 0.0)	# 4.2.1: src
				addPlanEdge(src_vertex, plan_vertex, PRIM_OP.out)		# 4.2.2: out

		# 2: create a root vertex
		self.root = Vertex('root')
		# don't add it to the vertices

		# 3: for each element in the PlanGraph (that is not the root vertex),
		# create an edge from the root to it, and label it lkp
		for vertex in self.vertices:
			addPlanEdge(self.root, vertex, PRIM_OP.lkp)

	def updatePlanCost(self, graph):
		"""
		Updates the cost of every edge of the PlanGraph for the given graph.

		The cost of lkp, in and out is queried from the graph. The cost of
		src and tgt is constant, it is restored from its backup (orig_cost),
		as Edmonds modifies the cost of the edges it processes.
		The types are always taken from the original end points of an edge
		(orig_src, orig_tgt), as Edmonds may have replaced src and tgt by a
		contracted cycle.

		graph: the SearchGraph that provides the costs.
		returns True if successfully updated cost,
		returns False if a type in the pattern is not in the graph
		(the graph returned None as cost).
		Raises a TypeError if graph is not a SearchGraph.
		"""
		if not isinstance(graph, SearchGraph):
			raise TypeError('updatePlanCost expects a SearchGraph')

		for edge in self.edges:
			if edge.label == PRIM_OP.lkp:
				edge.cost = graph.getCostLkp(edge.orig_tgt.type, edge.orig_tgt.is_vertex)
				if edge.cost is None:
					print('failed lkp')
					return False
			elif edge.label == PRIM_OP.inc:
				# in(v, e), binds an incoming edge e from an already bound vertex v,
				# depends on the number of incoming edges of type e for the vertex type
				edge.cost = graph.getCostInc(edge.orig_src.type, edge.orig_tgt.type)
				if edge.cost is None:
					print('failed in')
					return False
			elif edge.label == PRIM_OP.out:
				# (analogue for out(v, e))
				edge.cost = graph.getCostOut(edge.orig_src.type, edge.orig_tgt.type)
				if edge.cost is None:
					print('failed out')
					return False
			else:
				# src and tgt have a constant cost: restore it, so that a cost
				# modified by a previous Edmonds run does not leak into orig_cost
				edge.cost = edge.orig_cost
			# backup orig cost, as Edmonds changes cost
			edge.orig_cost = edge.cost
		return True

	def Edmonds(self, searchGraph):
		"""
		Returns the minimum directed spanning tree (MDST)
		for the pattern and the provided graph.

		The MDST is returned as a list of primitive operations, created in a
		best first manner (lowest original cost first), each one being a tuple:
		- in/out:  (PRIM_OP, vertex type, edge type, cost)
		- lkp:     (PRIM_OP, vertex/edge type, is_vertex, cost)
		- src/tgt: (PRIM_OP, edge type, cost)

		Returns None if it is impossible to find the pattern in the Graph
		(vertex type or edge type from pattern not in Graph).
		Raises a RuntimeError if an element of the PlanGraph has no incoming
		edge, or if no edge is found that can break a cycle.
		"""
		# Contracting cycles overrides src, tgt and cost of the plan edges.
		# Restore the backups first, so that the PlanGraph can be reused for
		# another run (this does not change anything on the first run).
		for plan_edge in self.edges:
			plan_edge.src = plan_edge.orig_src
			plan_edge.tgt = plan_edge.orig_tgt
			if plan_edge.label == PRIM_OP.src or plan_edge.label == PRIM_OP.tgt:
				# constant cost, not recomputed by updatePlanCost
				plan_edge.cost = plan_edge.orig_cost

		# update the cost for the PlanGraph
		if not self.updatePlanCost(searchGraph):
			print('type in pattern not found in Graph (in Edmonds)')
			# (returns False if a type in the pattern can not be found in the graph)
			return None
		# Complete Edmonds algorithm has optimization steps:
		# a: remove edges entering the root
		# b: merge parallel edges from same src to same tgt with min weight
		# we can ignore this as:
		# a: the root does not have incoming edges
		# b: the PlanGraph does not have such parallel edges

		# 1: for each node v (other than root), find incoming edge with lowest weight
		# insert those
		pi_v = {}
		for plan_vertex in self.vertices:
			min_weight = float('infinity')
			min_edge = None
			for plan_edge in plan_vertex.incoming_edges:
				if plan_edge.cost < min_weight:
					min_weight = plan_edge.cost
					min_edge = plan_edge
			# save plan_vertex and its minimum incoming edge
			pi_v[plan_vertex] = min_edge
			if min_edge is None:
				raise RuntimeError('baka: no min_edge found')

		def getCycle(vertex, reverse_graph, visited):
			"""
			Walk from vertex to root, we walk in a reverse order, as each vertex
			only has one incoming edge, so we walk to the source of that incoming
			edge. We stop when we reach a vertex that was already visited
			(by an earlier walk, or the root), in that case we return None.
			When we reach a vertex from our current path, we return that cycle
			(as a set of vertices), by first removing its tail.
			All walked vertices get added to visited.
			"""
			walked = []				# we could only save it once, but we need order
			current_path = set()	# and lookup in an array is slower than in set
			# we assume root is in visited (it must be in it)
			while vertex not in visited:
				if vertex in current_path:
					# we found a cycle, the cycle however might look like a: O--,
					# g f e			where we first visited a, then b, c, d,...
					# h   d c b a	k points back to d, completing a cycle,
					# i j k			but c b a is the tail that does not belong
					# in the cycle, removing this is "easy" as we know that
					# we first visited the tail, so they are the first elements
					# in our walked path
					for tail_part in walked:
						if tail_part != vertex:
							current_path.remove(tail_part)
						else:
							break

					visited.update(walked)
					return current_path
				current_path.add(vertex)
				walked.append(vertex)
				# by definition, an MDST only has one incoming edge per vertex
				# so we follow it upwards
				# vertex <--(minimal edge)-- src
				vertex = reverse_graph[vertex].src

			# no cycle found (the current path led to a visited vertex)
			visited.update(walked)
			return None

		class VertexGraph(Vertex):
			"""
			Acts as a super vertex, holds a subgraph (that is/was once a cycle).
			Used for Edmonds contraction step.
			The incoming edges are the edges leading to the vertices in the
			VertexGraph (they exclude edges from a vertex in the cycle to
			another vertex in the cycle).
			Analogue for outgoing edges.
			"""
			def __init__(self, cycle, reverseGraph):
				# Call parent class constructor, the type is the
				# concatenation of the types of the vertices in the cycle
				str_type = ''.join(str(vertex.type) for vertex in cycle)
				Vertex.__init__(self, str_type)
				# member variables:
				self.internalMDST = {}	# {vertex in cycle, its incoming edge}

				minIntWeight = self.findMinIntWeight(cycle, reverseGraph)
				self.updateMinExtEdge(minIntWeight, reverseGraph)

			def findMinIntWeight(self, cycle, reverseGraph):
				"""
				Returns the smallest cost of the internal incoming edges of
				the cycle. (Also saves its internalMDST (currently a cycle).)
				Collects the edges entering and leaving the cycle as the
				incoming and outgoing edges of this VertexGraph, the edges
				leaving the cycle get this VertexGraph as their src.
				(The VertexGraph formed by the cycle will be added to the
				reverseGraph by calling updateMinExtEdge.)
				Raises a TypeError if the chosen incoming edge of a vertex
				does not originate from within the cycle.
				"""
				minIntWeight = float('infinity')

				for vertex in cycle:
					# add incoming edges to this VertexGraph
					for inc_edge in vertex.incoming_edges:
						if inc_edge.src in cycle:
							# edge from within the cycle
							minIntWeight = min(minIntWeight, inc_edge.cost)
						else:
							# edge from outside the cycle
							self.addIncomingEdge(inc_edge)
					# add outgoing edges to this VertexGraph
					for out_edge in vertex.outgoing_edges:
						if out_edge.tgt not in cycle:
							# edge leaves the cycle
							self.addOutgoingEdge(out_edge)
							# update src to this VertexGraph
							out_edge.src = self
					# save internal MDST
					min_edge = reverseGraph[vertex]
					if min_edge.src in cycle:
						self.internalMDST[vertex] = min_edge
					else:
						raise TypeError('how is this a cycle')

				return minIntWeight

			def updateMinExtEdge(self, minIntWeight, reverseGraph):
				"""
				Modifies the cost of the external incoming edges and finds the
				minimum external incoming edge with this modified weight.
				This found edge will break the cycle, update the internalMDST
				from a cycle to an MDST, updates the reverseGraph to include
				the vertexGraph.
				Raises a RuntimeError if no edge can break the cycle.
				"""
				minExt = None
				minModWeight = float('infinity')

				# Find incoming edge from outside of the cycle with minimal
				# modified cost. This edge will break the cycle.
				for inc_edge in self.incoming_edges:
					# An incoming edge (with src from within the cycle), can be
					# from a contracted part of the graph. Assume bc is a
					# contracted part (VertexGraph) a, bc is a newly formed
					# cycle (due to the breaking of the previous cycle bc). bc
					# has at least lkp incoming edges to b and c, but we should
					# not consider the lkp of c to break the cycle.
					# If we want to break a, bc, select plausible edges,
					#  /<--\
					# a     bc   bc's MDST b <-- c
					#  \-->/
					# by looking at their original targets.
					# (if cycle inc_edge.orig_tgt == external inc_edge.orig_tgt)
					cycle_edge = reverseGraph[inc_edge.tgt]
					if cycle_edge.orig_tgt == inc_edge.orig_tgt:
						# modify cost: cost of inc_edge -
						# (cost of previously chosen minimum edge to cycle vertex - minIntWeight)
						inc_edge.cost -= (cycle_edge.cost - minIntWeight)
						if minExt is None or minModWeight > inc_edge.cost:
							# save better edge from outside of the cycle
							minExt = inc_edge
							minModWeight = inc_edge.cost

				if minExt is None:
					# fail with a clear error, before touching the reverseGraph
					raise RuntimeError('no external edge found to break the cycle')

				# Example: a, b is a cycle (we know that there are no other
				# incoming edges to a and/or b, as there is only exactly one
				# incoming edge per vertex), and the arrow from c to b represents
				# the minExt edge. We will remove the bottom arrow (from a to b)
				#  /<--\			and save the minExt edge in the reverseGraph.
				# a     b <-- c		This breaks the cycle. As the internalMDST
				#  \-->/			saves the internal MDST, and currently still
				# holds a cycle, we have to remove it from the internalMDST.
				# We have to remove all vertex bindings of the cycle from the
				# reverseGraph (as it is contracted into a single VertexGraph),
				# and store the minExt edge to this VertexGraph in it.
				for int_vertex in self.internalMDST:
					del reverseGraph[int_vertex]	# remove cycle from reverseGraph

				del self.internalMDST[minExt.tgt]	# remove/break cycle

				for inc_edge in self.incoming_edges:
					# update inc_edge's target to this VertexGraph
					inc_edge.tgt = self

				# save minExt edge to this VertexGraph in the reverseGraph
				reverseGraph[self] = minExt

		while True:
			# 2: find all cycles:
			cycles = []
			visited = set([self.root])	# root does not have incoming edges,
			for vertex in list(pi_v):	# it can not be part of a cycle
				if vertex not in visited:	# getCycle depends on root being in visited
					cycle = getCycle(vertex, pi_v, visited)
					if cycle is not None:
						cycles.append(cycle)

			# 2: if the set of edges {pi(v), v} does not contain any cycles,
			# Then we found our minimum directed spanning tree
			# otherwise, we'll have to resolve the cycles
			if not cycles:
				break

			# 3: For each formed cycle:
			# 3a: find internal incoming edge with the smallest cost
			# 3b: modify the cost of each arc which enters the cycle
			# 3c: replace smallest internal edge with the modified edge which has the smallest cost
			for cycle in cycles:
				# Breaks a cycle by:
				# - contracting cycle into VertexGraph
				# - finding the internal incoming edge with the smallest cost
				# - modify the cost of each arc which enters the cycle
				# - replacing the smallest internal edge with the modified edge which has the smallest cost
				# - changing reverseGraph accordingly (removes elements from cycle, adds vertexGraph)
				# (This will find a solution as the graph keeps shrinking with every cycle,
				# in the worst case the same amount as there are vertices, until
				# only the root and one vertexGraph remains)
				# (the constructor registers the VertexGraph in pi_v itself)
				VertexGraph(cycle, pi_v)

		class SortedContainer(object):
			"""
			A container that keeps elements sorted based on a given sortValue.
			Of the elements with the same sortValue, the one that got
			inserted last will be returned first.
			"""
			def __init__(self):
				# member variables:
				self.keys = []		# stores key in sorted order (sorted when pop gets called)
				self.sorted = {}	# {key, [elems with same key]}

			def add(self, sortValue, element):
				"""
				Adds element with sortValue to the SortedContainer.
				"""
				elems = self.sorted.get(sortValue)
				if elems is None:
					self.sorted[sortValue] = [element]
					self.keys.append(sortValue)
				else:
					elems.append(element)

			def pop(self):
				"""
				Sorts the SortedContainer, removes and returns an element
				with the smallest sortValue.
				"""
				self.keys.sort()
				smallest = self.keys[0]
				elems = self.sorted[smallest]
				elem = elems.pop()
				if not elems:
					del self.sorted[smallest]
					del self.keys[0]
				return elem

			def empty(self):
				"""
				Returns whether or not the sorted container is empty.
				"""
				return (len(self.keys) == 0)

		def createPRIM_OP(edge, inc_cost=True):
			"""
			Helper function to keep argument list short,
			return contracted data for a PRIM_OP
			(a tuple, the cost of the edge is appended when inc_cost is True).
			"""
			if edge.label == PRIM_OP.inc or edge.label == PRIM_OP.out:
				if inc_cost: # op		# vertex type		# actual edge type
					return (edge.label, edge.orig_src.type, edge.orig_tgt.type, edge.cost)
				else:
					return (edge.label, edge.orig_src.type, edge.orig_tgt.type)
			elif edge.label == PRIM_OP.lkp:
				if inc_cost: # op		# vertex/edge type	# is vertex or edge
					return (edge.label, edge.orig_tgt.type, edge.orig_tgt.is_vertex, edge.cost)
				else:
					return (edge.label, edge.orig_tgt.type, edge.orig_tgt.is_vertex)
			else:	# src, tgt operation
				if inc_cost: # op		# actual edge type
					return (edge.label, edge.orig_src.type, edge.cost)
				else:
					return (edge.label, edge.orig_src.type)

		def flattenReverseGraph(vertex, inc_edge, reverseGraph):
			"""
			Flattens the reverseGraph, so that the vertexGraph node can get
			processed to create a forwardGraph.
			"""
			if not isinstance(vertex, VertexGraph):
				reverseGraph[vertex] = inc_edge
			else:
				# bind the edge to the vertex it originally entered,
				# and unfold the content of the contracted cycle
				reverseGraph[inc_edge.orig_tgt] = inc_edge
				for vg, eg in inc_edge.tgt.internalMDST.items():
					flattenReverseGraph(vg, eg, reverseGraph)
			if isinstance(inc_edge.src, VertexGraph):
				# the edge leaves a contracted cycle, unfold that one too
				for vg, eg in inc_edge.src.internalMDST.items():
					flattenReverseGraph(vg, eg, reverseGraph)

		def createForwardGraph(vertex, inc_edge, forwardGraph):
			"""
			Create a forwardGraph, keeping in mind that there can be vertexGraph
			in the reverseGraph.
			"""
			forwardGraph.setdefault(inc_edge.orig_src, []).append(inc_edge)
			if isinstance(vertex, VertexGraph):
				for vg, eg in vertex.internalMDST.items():
					createForwardGraph(vg, eg, forwardGraph)

		MDST = []
		# pi_v contains {vertex, incoming_edge}
		# we want to start from root and follow the outgoing edges
		# so we have to build the forwardGraph graph for pi_v
		# (Except for the root (has 0), each vertex has exactly one incoming edge,
		# but might have multiple outgoing edges)
		forwardGraph = {}	# {vertex, [outgoing edge 1, ... ] }
		reverseGraph = {}

		# flatten reverseGraph (for the vertexGraph elements)
		for v, e in pi_v.items():
			flattenReverseGraph(v, e, reverseGraph)

		# create the forwardGraph
		for vertex, edge in reverseGraph.items():
			createForwardGraph(vertex, edge, forwardGraph)

		# create the MDST in a best first manner (lowest value first)
		current = SortedContainer()		# allows easy walking through tree
		# (an empty pattern has no edges leaving the root: the plan is empty)
		for edge in forwardGraph.get(self.root, []):
			current.add(edge.orig_cost, edge)	# use orig cost, not modified
		while not current.empty():
			p_op = current.pop()				# p_op contains an outgoing edge
			MDST.append(createPRIM_OP(p_op))
			for edge in forwardGraph.get(p_op.orig_tgt, []):
				current.add(edge.orig_cost, edge)
		return MDST