Skip to content

Rewrite of edit_distance with edge costs. fix #111 #137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 176 additions & 28 deletions src/editdist.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,16 @@ representing vertex operations:


### Optional Arguments
- `insert_cost::Function=v->1.0`
- `delete_cost::Function=u->1.0`
- `subst_cost::Function=(u,v)->0.5`
- `vertex_insert_cost::Function=v->0.`
- `vertex_delete_cost::Function=u->0.`
- `vertex_subst_cost::Function=(u, v)->0.`
- `edge_insert_cost::Function=e->1.`
- `edge_delete_cost::Function=e->1.`
- `edge_subst_cost::Function=(e1, e2)->0.`

The algorithm will always try to match two edges if it can, so if it is
preferable to delete two edges rather than match them, it should be
reflected in the `edge_subst_cost` function.

By default, the algorithm uses constant operation costs. The
user can provide classical Minkowski costs computed from vertex
Expand All @@ -31,7 +38,7 @@ search in case the default heuristic is not satisfactory.
- Given two graphs ``|G₁| < |G₂|``, `edit_distance(G₁, G₂)` is faster to
compute than `edit_distance(G₂, G₁)`. Consider swapping the arguments
if involved costs are equivalent.
- The use of simple Minkowski costs can improve performance considerably.
- The use of a heuristic can improve performance considerably.
- Exploit vertex attributes when designing operation costs.

### References
Expand All @@ -49,51 +56,163 @@ julia> g1 = SimpleDiGraph([0 1 0 0 0; 0 0 1 0 0; 1 0 0 1 0; 0 0 0 0 1; 0 0 0 1 0
julia> g2 = SimpleDiGraph([0 1 0; 0 0 1; 1 0 0]);

julia> edit_distance(g1, g2)
(3.5, Tuple[(1, 2), (2, 1), (3, 0), (4, 3), (5, 0)])
(3.0, Tuple[(1, 3), (2, 1), (3, 2), (4, 0), (5, 0)])
```
"""
function edit_distance(
G₁::AbstractGraph,
G₂::AbstractGraph;
insert_cost::Function=v -> 1.0,
delete_cost::Function=u -> 1.0,
subst_cost::Function=(u, v) -> 0.5,
heuristic::Function=DefaultEditHeuristic,
vertex_insert_cost=nothing,
vertex_delete_cost=nothing,
vertex_subst_cost=nothing,
edge_insert_cost=nothing,
edge_delete_cost=nothing,
edge_subst_cost=nothing,
heuristic=nothing,
)
# NOTE(review): the keyword list above declares `heuristic` twice and still carries
# `insert_cost`/`delete_cost`/`subst_cost`, which the body never reads — this looks
# like both sides of a diff rendered together; confirm against the real file.

# Use `default_edit_heuristic` only when the caller customised neither a cost
# function nor the heuristic (presumably because that heuristic is designed for
# the default costs — TODO confirm).
if isnothing(vertex_insert_cost) &&
isnothing(vertex_delete_cost) &&
isnothing(vertex_subst_cost) &&
isnothing(edge_insert_cost) &&
isnothing(edge_delete_cost) &&
isnothing(edge_subst_cost) &&
isnothing(heuristic)
heuristic = default_edit_heuristic
end
# Every cost still set to `nothing` falls back to a constant function:
# vertex operations cost 0, edge insertion/deletion cost 1, edge substitution 0.
vertex_insert_cost = something(vertex_insert_cost, v -> 0.0)
vertex_delete_cost = something(vertex_delete_cost, v -> 0.0)
vertex_subst_cost = something(vertex_subst_cost, (u, v) -> 0.0)
edge_insert_cost = something(edge_insert_cost, e -> 1.0)
edge_delete_cost = something(edge_delete_cost, e -> 1.0)
edge_subst_cost = something(edge_subst_cost, (e1, e2) -> 0.0)
# A heuristic left unset at this point (i.e. some cost was customised) becomes
# the constant-zero function.
heuristic = something(heuristic, (λ, G₁, G₂) -> 0.0)
# Hand the fully resolved functions to the positional implementation.
return _edit_distance(
G₁::AbstractGraph,
G₂::AbstractGraph,
vertex_insert_cost,
vertex_delete_cost,
vertex_subst_cost,
edge_insert_cost,
edge_delete_cost,
edge_subst_cost,
heuristic,
)
end

function _edit_distance(
G₁::AbstractGraph{T},
G₂::AbstractGraph{U},
vertex_insert_cost::Function,
vertex_delete_cost::Function,
vertex_subst_cost::Function,
edge_insert_cost::Function,
edge_delete_cost::Function,
edge_subst_cost::Function,
heuristic::Function,
) where {T<:Integer,U<:Integer}
isdirected = is_directed(G₁) || is_directed(G₂)

# Edge cost of mapping `u1 -> v1` given that `u2 -> v2` is already on the path.
# The pair (u1, u2) in G₁ is compared with the pair (v1, v2) in G₂: an edge present
# in both graphs is substituted, one present only in G₁ is deleted, one present
# only in G₂ is inserted. `u2` and `v2` may be 0.
function association_cost(u1, u2, v1, v2)
    total = 0.0

    # forward direction: u1 -> u2 against v1 -> v2
    fwd₁ = has_edge(G₁, u1, u2)
    fwd₂ = has_edge(G₂, v1, v2)
    if fwd₁ && fwd₂
        total += edge_subst_cost(Edge(u1, u2), Edge(v1, v2))
    elseif fwd₁
        total += edge_delete_cost(Edge(u1, u2))
    elseif fwd₂
        total += edge_insert_cost(Edge(v1, v2))
    end

    # the reverse direction is examined only for directed inputs, and never for a
    # self-loop (u1 == u2), whose single edge was already handled above
    (isdirected && u1 != u2) || return total

    # reverse direction: u2 -> u1 against v2 -> v1
    bwd₁ = has_edge(G₁, u2, u1)
    bwd₂ = has_edge(G₂, v2, v1)
    if bwd₁ && bwd₂
        total += edge_subst_cost(Edge(u2, u1), Edge(v2, v1))
    elseif bwd₁
        total += edge_delete_cost(Edge(u2, u1))
    elseif bwd₂
        total += edge_insert_cost(Edge(v2, v1))
    end
    return total
end

# A* search heuristic
h(λ) = heuristic(λ, G₁, G₂)

# initialize open set
OPEN = PriorityQueue{Vector{Tuple},Float64}()
for v in 1:nv(G₂)
enqueue!(OPEN, [(1, v)], subst_cost(1, v) + h([(1, v)]))
for v in vertices(G₂)
enqueue!(OPEN, [(T(1), v)], vertex_subst_cost(1, v) + h([(T(1), v)]))
end
enqueue!(OPEN, [(1, 0)], delete_cost(1) + h([(1, 0)]))
enqueue!(OPEN, [(T(1), U(0))], vertex_delete_cost(1) + h([(T(1), U(0))]))

c = 0
while true
# minimum (partial) edit path
λ, cost = peek(OPEN)
c += 1
dequeue!(OPEN)

if is_complete_path(λ, G₁, G₂)
return cost, λ
else
k, _ = λ[end]
vs = setdiff(1:nv(G₂), [v for (u, v) in λ])
u1, _ = λ[end]
u1 += T(1)
vs = setdiff(vertices(G₂), [v for (u, v) in λ])

if k < nv(G₁) # there are still vertices to process in G₁?
for v in vs
λ⁺ = [λ; (k + 1, v)]
enqueue!(OPEN, λ⁺, cost + subst_cost(k + 1, v) + h(λ⁺) - h(λ))
if u1 <= nv(G₁) # there are still vertices to process in G₁?
# we try every possible assignment of v1
for v1 in vs
λ⁺ = [λ; (u1, v1)]
new_cost = cost + vertex_subst_cost(u1, v1) + h(λ⁺) - h(λ)
for (u2, v2) in λ
new_cost += association_cost(u1, u2, v1, v2)
end
new_cost += association_cost(u1, u1, v1, v1) # handle self-loops

enqueue!(OPEN, λ⁺, new_cost)
end
# we try deleting u1
λ⁺ = [λ; (u1, U(0))]
new_cost = cost + vertex_delete_cost(u1) + h(λ⁺) - h(λ)
for u2 in outneighbors(G₁, u1)
# edges deleted later when assigning v2
u2 > u1 && continue
new_cost += edge_delete_cost(Edge(u1, u2))
end
λ⁺ = [λ; (k + 1, 0)]
enqueue!(OPEN, λ⁺, cost + delete_cost(k + 1) + h(λ⁺) - h(λ))
if isdirected
for u2 in inneighbors(G₁, u1)
# edges deleted later when assigning v2, and we should not count a self loop twice
u2 >= u1 && continue
new_cost += edge_delete_cost(Edge(u2, u1))
end
end
enqueue!(OPEN, λ⁺, new_cost)
else
# add remaining vertices of G₂ to the path
λ⁺ = [λ; [(0, v) for v in vs]]
total_insert_cost = sum(insert_cost, vs)
enqueue!(OPEN, λ⁺, cost + total_insert_cost + h(λ⁺) - h(λ))
# add remaining vertices of G₂ to the path by inserting them
λ⁺ = [λ; [(T(0), v) for v in vs]]
new_cost = cost + sum(vertex_insert_cost, vs)
for v1 in vs
for v2 in outneighbors(G₂, v1)
(v2 > v1 && v2 in vs) && continue # these edges will be deleted later
new_cost += edge_insert_cost(Edge(v1, v2))
end
if isdirected
for v2 in inneighbors(G₂, v1)
(v2 > v1 && v2 in vs) && continue # these edges will be deleted later
v1 == v2 && continue # we should not count a self loop twice
new_cost += edge_insert_cost(Edge(v2, v1))
end
end
end
enqueue!(OPEN, λ⁺, new_cost + h(λ⁺) - h(λ))
end
end
end
Expand All @@ -112,11 +231,40 @@ function is_complete_path(λ, G₁, G₂)
return length(us) == nv(G₁) && length(vs) == nv(G₂)
end

function DefaultEditHeuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
vs = Set([v for (u, v) in λ])
delete!(vs, 0)
# edit_distance(G₁::AbstractGraph, G₂::AbstractGraph) =
# edit_distance(G₁, G₂,
# vertex_insert_cost=v -> 0.,
# vertex_delete_cost=u -> 0.,
# vertex_subst_cost=(u, v) -> 0.,
# edge_insert_cost=e -> 1.,
# edge_delete_cost=e -> 1.,
# edge_subst_cost=(e1, e2) -> 0.,
# heuristic=default_edit_heuristic)

return nv(G₂) - length(vs)
"""
    default_edit_heuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)

Return the absolute difference between the "free" edge counts of `G₁` and `G₂`
under the partial edit path `λ`.

A vertex is free when it does not yet appear on its side of `λ`. For each graph the
count is the sum of the out-degrees of its free vertices, plus, for every pair of
`λ` whose two entries are both nonzero, the number of free out-neighbors of the
paired vertex. When neither graph is directed both counts are halved.
"""
function default_edit_heuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
    free_us = setdiff(1:nv(G₁), [u for (u, _) in λ])
    free_vs = setdiff(1:nv(G₂), [v for (_, v) in λ])

    # out-degree mass of the vertices not yet placed on the path
    free_edges₁ = isempty(free_us) ? 0 : sum(u -> outdegree(G₁, u), free_us)
    free_edges₂ = isempty(free_vs) ? 0 : sum(v -> outdegree(G₂, v), free_vs)

    # edges leaving an already matched vertex towards a free one
    for (u, v) in λ
        if u != 0 && v != 0
            free_edges₁ += count(in(free_us), outneighbors(G₁, u))
            free_edges₂ += count(in(free_vs), outneighbors(G₂, v))
        end
    end

    if !(is_directed(G₁) || is_directed(G₂))
        free_edges₁ = free_edges₁ / 2
        free_edges₂ = free_edges₂ / 2
    end
    return abs(free_edges₁ - free_edges₂)
end

#-------------------------
Expand Down
71 changes: 34 additions & 37 deletions test/edit_distance.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,48 @@
gquad = random_regular_graph(4, 2; rng=rng)
gpent = random_regular_graph(5, 2; rng=rng)

@testset "edit_distance $triangle, $quadrangle, $pentagon" for triangle in
testgraphs(gtri),
quadrangle in testgraphs(gquad),
pentagon in testgraphs(gpent)
g1 = star_graph(4)
g2 = cycle_graph(3)

d, λ = @inferred(
edit_distance(triangle, quadrangle, subst_cost=MinkowskiCost(1:3, 1:4))
)
@test d == 1.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4)]

d, λ = @inferred(
edit_distance(quadrangle, triangle, subst_cost=MinkowskiCost(1:4, 1:3))
)
@test d == 1.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0)]
vertex_insert_cost = v -> 1.0
vertex_delete_cost = v -> 2.0
vertex_subst_cost = (u, v) -> 3.0
edge_insert_cost = e -> 4.0
edge_delete_cost = e -> 5.0
edge_subst_cost = (e1, e2) -> 6.0

d, λ = @inferred(
edit_distance(triangle, pentagon, subst_cost=MinkowskiCost(1:3, 1:5))
)
@testset "undirected edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2)
d, λ = @inferred(edit_distance(G1, G2))
@test d == 2.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4), (0, 5)]

d, λ = @inferred(
edit_distance(pentagon, triangle, subst_cost=MinkowskiCost(1:5, 1:3))
edit_distance(
G1,
G2,
vertex_insert_cost=vertex_insert_cost,
vertex_delete_cost=vertex_delete_cost,
vertex_subst_cost=vertex_subst_cost,
edge_insert_cost=edge_insert_cost,
edge_delete_cost=edge_delete_cost,
edge_subst_cost=edge_subst_cost,
)
)
@test d == 2.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0), (5, 0)]
# 1 vertex deletion, 3 vertex substitutions, 1 edge insertion, 1 edge deletion, 2 edge substitutions
@test d == 32.0
end

@testset "Minkowski cost / bounded Minkowski" begin
cost = @inferred(MinkowskiCost(1:3, 1:3))
bcost = @inferred(BoundedMinkowskiCost(1:3, 1:3))
for i in 1:3
@test cost(i, i) == 0.0
@test bcost(i, i) == 2 / 3
end
g1 = DiGraph(4)
edges = [(1, 2), (1, 4), (2, 3), (3, 1), (3, 4), (4, 1), (1, 1), (4, 4)]
for e in edges
add_edge!(g1, e)
end
g2 = DiGraph(4)
edges = [(2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (2, 2), (3, 3)]
for e in edges
add_edge!(g2, e)
end

g1c = complete_graph(4)
g2c = complete_graph(4)
rem_edge!(g2c, 1, 2)
@testset "edit_distance $g1, $g2" for g1 in testgraphs(g1c), g2 in testgraphs(g2c)
d, λ = @inferred(edit_distance(g1, g2))
@test d == 2.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 4)]
@testset "directed edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2)
d, λ = @inferred(edit_distance(G1, G2))
@test d == 4.0
end
end