2025
Petrini, Sonia; Ferrer-i-Cancho, Ramon
The distribution of syntactic dependency distances Journal Article
In: Glottometrics, vol. 58, pp. 35-94, 2025.
Abstract | Links | BibTeX | Tags: dependency syntax, network science, word order
@article{Petrini2022c,
title = {The distribution of syntactic dependency distances},
author = {Sonia Petrini and Ramon Ferrer-i-Cancho},
url = {https://arxiv.org/abs/2211.14620},
doi = {10.53482/2025_58_424},
year = {2025},
date = {2025-01-01},
journal = {Glottometrics},
volume = {58},
pages = {35-94},
abstract = {The syntactic structure of a sentence can be represented as a graph, where vertices are words and edges indicate syntactic dependencies between them. In this setting, the distance between two linked words is defined as the difference between their positions. Here we wish to contribute to the characterization of the actual distribution of syntactic dependency distances, which has previously been argued to follow a power-law distribution. Here we propose a new model with two exponential regimes in which the probability decay is allowed to change after a break-point. This transition could mirror the transition from the processing of word chunks to higher-level structures. We find that a two-regime model - where the first regime follows either an exponential or a power-law decay - is the most likely one in all 20 languages we considered, independently of sentence length and annotation style. Moreover, the break-point exhibits low variation across languages and averages values of 4-5 words, suggesting that the amount of words that can be simultaneously processed abstracts from the specific language to a high degree. The probability decay slows down after the breakpoint, consistently with a universal chunk-and-pass mechanism. Finally, we give an account of the relation between the best estimated model and the closeness of syntactic dependencies as function of sentence length, according to a recently introduced optimality score.},
keywords = {dependency syntax, network science, word order},
pubstate = {published},
tppubtype = {article}
}
2024
Alemany-Puig, L.; Esteban, J. L.; Ferrer-i-Cancho, R.
The Maximum Linear Arrangement Problem for trees under projectivity and planarity Journal Article
In: Information Processing Letters, vol. 183, pp. 106400, 2024.
Abstract | Links | BibTeX | Tags: dependency syntax, network science, word order
@article{Alemany2022b,
title = {The Maximum Linear Arrangement Problem for trees under projectivity and planarity},
author = {L. Alemany-Puig and J. L. Esteban and R. Ferrer-i-Cancho},
url = {https://arxiv.org/abs/2206.06924},
doi = {10.1016/j.ipl.2023.106400},
year = {2024},
date = {2024-01-01},
journal = {Information Processing Letters},
volume = {183},
pages = {106400},
abstract = {The Maximum Linear Arrangement problem (MaxLA) consists of finding a mapping π from the n vertices of a graph G to distinct consecutive integers that maximizes the sum of edge lengths. In this setting, vertices are considered to lie on a horizontal line and edges are drawn as semicircles above the line. There exist variants of MaxLA in which the arrangements are constrained. In the planar variant edge crossings are forbidden. In the projective variant for rooted trees arrangements are planar and the root cannot be covered by any edge. Here we present O(n)-time and O(n)-space algorithms that solve Planar and Projective MaxLA for trees. We also prove several properties of maximum projective and planar arrangements.},
keywords = {dependency syntax, network science, word order},
pubstate = {published},
tppubtype = {article}
}
Alemany-Puig, L.; Ferrer-i-Cancho, R.
The expected sum of edge lengths in planar linearizations of trees Journal Article
In: Journal of Language Modelling, vol. 12, no. 1, pp. 1-42, 2024.
Abstract | Links | BibTeX | Tags: dependency syntax, network science, word order
@article{Alemany2022c,
title = {The expected sum of edge lengths in planar linearizations of trees},
author = {L. Alemany-Puig and R. Ferrer-i-Cancho},
url = {https://arxiv.org/abs/2207.05564},
doi = {10.15398/jlm.v12i1.362},
year = {2024},
date = {2024-01-01},
journal = {Journal of Language Modelling},
volume = {12},
number = {1},
pages = {1-42},
abstract = {Dependency graphs have proven to be a very successful model to represent the syntactic structure of sentences of human languages. In these graphs, widely accepted to be trees, vertices are words and arcs connect syntactically-dependent words. The tendency of these dependencies to be short has been demonstrated using random baselines for the sum of the lengths of the edges or its variants. A ubiquitous baseline is the expected sum in projective orderings (wherein edges do not cross and the root word of the sentence is not covered by any edge). It was shown that said expected value can be computed in O(n) time. In this article we focus on planar orderings (where the root word can be covered) and present two main results. First, we show the relationship between the expected sum in planar arrangements and the expected sum in projective arrangements. Second, we also derive a O(n)-time algorithm to calculate the expected value of the sum of edge lengths. These two results stem from another contribution of the present article, namely a characterization of planarity that, given a sentence, yields either the number of planar permutations or an efficient algorithm to generate uniformly random planar permutations of the words. Our research paves the way for replicating past research on dependency distance minimization using random planar linearizations as random baseline.},
keywords = {dependency syntax, network science, word order},
pubstate = {published},
tppubtype = {article}
}
2023
Alemany-Puig, L.; Esteban, J. L.; Ferrer-i-Cancho, R.
On the maximum linear arrangement problem for trees Journal Article
In: 2023.
Abstract | Links | BibTeX | Tags: dependency syntax, network science, word order
@article{Alemany2023a,
title = {On the maximum linear arrangement problem for trees},
author = {L. Alemany-Puig and J. L. Esteban and R. Ferrer-i-Cancho},
url = {https://arxiv.org/abs/2312.04487},
year = {2023},
date = {2023-01-01},
abstract = {Linear arrangements of graphs are a well-known type of graph labeling and are found at the heart of many important computational problems, such as the Minimum Linear Arrangement Problem (minLA). A linear arrangement is usually defined as a permutation of the n vertices of a graph. An intuitive geometric setting is that of vertices lying on consecutive integer positions in the real line, starting at 1; edges are typically drawn as semicircles above the real line. In this paper we study the Maximum Linear Arrangement problem (MaxLA), the maximization variant of minLA and a less studied problem than minLA. We a devise new characterization of maximum arrangements of general graphs, and prove that MaxLA can be solved for cycle graphs in constant time, and for k-linear trees (k≤2) in time O(n). We present a simple algorithm that solves a constrained variant of MaxLA, which we call bipartite MaxLA, in time O(n). This algorithm has two promising characteristics. First, it solves MaxLA for most trees consisting of a few tenths of nodes. Second, it produces a high quality approximation to MaxLA for trees where the algorithm fails to solve MaxLA. Furthermore, we conjecture this algorithm solves MaxLA for at least 50% of all free trees.},
keywords = {dependency syntax, network science, word order},
pubstate = {published},
tppubtype = {article}
}
2022
Alemany-Puig, Lluís; Esteban, Juan L.; Ferrer-i-Cancho, Ramon
The Linear Arrangement Library. A new tool for research on syntactic dependency structures Proceedings Article
In: Proceedings of the Second Workshop on Quantitative Syntax (Quasy, SyntaxFest 2021), pp. 1-16, Association for Computational Linguistics, Sofia, Bulgaria, 2022.
Abstract | Links | BibTeX | Tags: dependency syntax, network science, word order
@inproceedings{Alemany2022a,
title = {The Linear Arrangement Library. A new tool for research on syntactic dependency structures},
author = {Lluís Alemany-Puig and Juan L. Esteban and Ramon Ferrer-i-Cancho},
url = {https://arxiv.org/abs/2112.02512},
year = {2022},
date = {2022-01-01},
booktitle = {Proceedings of the Second Workshop on Quantitative Syntax (Quasy, SyntaxFest 2021)},
pages = {1-16},
publisher = {Association for Computational Linguistics},
address = {Sofia, Bulgaria},
abstract = {The new and growing field of Quantitative Dependency Syntax has emerged at the crossroads between Dependency Syntax and Quantitative Linguistics. One of the main concerns in this field is the statistical patterns of syntactic dependency structures. These structures, grouped in treebanks, are the source for statistical analyses in these and related areas; dozens of scores devised over the years are the tools of a new industry to search for patterns and perform other sorts of analyses. The plethora of such metrics and their increasing complexity require sharing the source code of the programs used to perform such analyses. However, such code is not often shared with the scientific community or is tested following unknown standards. Here we present a new open-source tool, the Linear Arrangement Library (LAL), which caters to the needs of, especially, inexperienced programmers. This tool enables the calculation of these metrics on single syntactic dependency structures, treebanks, and collection of treebanks, grounded on ease of use and yet with great flexibility. LAL has been designed to be efficient, easy to use (while satisfying the needs of all levels of programming expertise), reliable (thanks to thorough testing), and to unite research from different traditions, geographic areas, and research fields.},
keywords = {dependency syntax, network science, word order},
pubstate = {published},
tppubtype = {inproceedings}
}
Ferrer-i-Cancho, Ramon; Gómez-Rodríguez, Carlos
Dependency distance minimization predicts compression Proceedings Article
In: Proceedings of the Second Workshop on Quantitative Syntax (Quasy, SyntaxFest 2021), pp. 45-57, Association for Computational Linguistics, Sofia, Bulgaria, 2022.
Abstract | Links | BibTeX | Tags: dependency syntax, theory construction
@inproceedings{Ferrer2021a,
title = {Dependency distance minimization predicts compression},
author = {Ramon Ferrer-i-Cancho and Carlos Gómez-Rodríguez},
url = {https://arxiv.org/abs/2109.08900},
year = {2022},
date = {2022-01-01},
booktitle = {Proceedings of the Second Workshop on Quantitative Syntax (Quasy, SyntaxFest 2021)},
pages = {45-57},
publisher = {Association for Computational Linguistics},
address = {Sofia, Bulgaria},
abstract = {Dependency distance minimization (DDm) is a well-established principle of word order. It has been predicted theoretically that DDm implies compression, namely the minimization of word lengths. This is a second order prediction because it links a principle with another principle, rather than a principle and a manifestation as in a first order prediction. Here we test that second order prediction with a parallel collection of treebanks controlling for annotation style with Universal Dependencies and Surface-Syntactic Universal Dependencies. To test it, we use a recently introduced score that has many mathematical and statistical advantages with respect to the widely used sum of dependency distances. We find that the prediction is confirmed by the new score when word lengths are measured in phonemes, independently of the annotation style, but not when word lengths are measured in syllables. In contrast, one of the most widely used scores, i.e. the sum of dependency distances, fails to confirm that prediction, showing the weakness of raw dependency distances for research on word order. Finally, our findings expand the theory of natural communication by linking two distinct levels of organization, namely syntax (word order) and word internal structure.},
keywords = {dependency syntax, theory construction},
pubstate = {published},
tppubtype = {inproceedings}
}
Alemany-Puig, L.; Ferrer-i-Cancho, R.
Linear-time calculation of the expected sum of edge lengths in random projective linearizations of trees Journal Article
In: Journal of Computational Linguistics, vol. 48, no. 3, pp. 491–516, 2022.
Abstract | Links | BibTeX | Tags: dependency syntax, network science, word order
@article{Alemany2021b,
title = {Linear-time calculation of the expected sum of edge lengths in random projective linearizations of trees},
author = {L. Alemany-Puig and R. Ferrer-i-Cancho},
url = {https://arxiv.org/abs/2107.03277},
doi = {10.1162/coli_a_00442},
year = {2022},
date = {2022-01-01},
journal = {Journal of Computational Linguistics},
volume = {48},
number = {3},
pages = {491–516},
abstract = {The syntactic structure of a sentence is often represented using syntactic dependency trees. The sum of the distances between syntactically related words has been in the limelight for the past decades. Research on dependency distances led to the formulation of the principle of dependency distance minimization whereby words in sentences are ordered so as to minimize that sum. Numerous random baselines have been defined to carry out related quantitative studies on languages. The simplest random baseline is the expected value of the sum in unconstrained random permutations of the words in the sentence, namely when all the shufflings of the words of a sentence are allowed and equally likely. Here we focus on a popular baseline: random projective permutations of the words of the sentence, that is, permutations where the syntactic dependency structure is projective, a formal constraint that sentences satisfy often in languages. Thus far, the expectation of the sum of dependency distances in random projective shufflings of a sentence has been estimated approximately with a Monte Carlo procedure whose cost is of the order of Zn, where n is the number of words of the sentence and Z is the number of samples; the larger Z, the lower the error of the estimation but the larger the time cost. Here we present formulae to compute that expectation without error in time of the order of n. Furthermore, we show that star trees maximize it, and devise a dynamic programming algorithm to retrieve the trees that minimize it.},
keywords = {dependency syntax, network science, word order},
pubstate = {published},
tppubtype = {article}
}
Gómez-Rodríguez, C.; Christiansen, M. H.; Ferrer-i-Cancho, R.
Memory limitations are hidden in grammar Journal Article
In: Glottometrics, vol. 52, pp. 39 – 64, 2022.
Abstract | Links | BibTeX | Tags: dependency syntax, network science, word order
@article{Gomez2019a,
title = {Memory limitations are hidden in grammar},
author = {C. Gómez-Rodríguez and M. H. Christiansen and R. Ferrer-i-Cancho},
url = {https://arxiv.org/abs/1908.06629},
doi = {10.53482/2022_52_397},
year = {2022},
date = {2022-01-01},
journal = {Glottometrics},
volume = {52},
pages = {39 – 64},
abstract = {The ability to produce and understand an unlimited number of different sentences is a hallmark of human language. Linguists have sought to define the essence of this generative capacity using formal grammars that describe the syntactic dependencies between constituents, independent of the computational limitations of the human brain. Here, we evaluate this independence assumption by sampling sentences uniformly from the space of possible syntactic structures. We find that the average dependency distance between syntactically related words, a proxy for memory limitations, is less than expected by chance in a collection of state-of-the-art classes of dependency grammars. Our findings indicate that memory limitations have permeated grammatical descriptions, suggesting that it may be impossible to build a parsimonious theory of human linguistic productivity independent of non-linguistic cognitive constraints.},
keywords = {dependency syntax, network science, word order},
pubstate = {published},
tppubtype = {article}
}
In case the fancy publication browser above fails, you can also try.